Diffstat (limited to 'arch/x86')
239 files changed, 7914 insertions, 2214 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fbf26e0f7a6a..7b6dd10b162a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,10 +14,11 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW + select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION - select GENERIC_VDSO_32 config X86_64 def_bool y @@ -91,7 +92,9 @@ config X86 select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -109,7 +112,6 @@ config X86 select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE @@ -163,11 +165,13 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -199,6 +203,7 @@ config X86 select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD + select HAVE_MOVE_PUD select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -330,9 +335,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config KASAN_SHADOW_OFFSET hex depends on KASAN @@ -1931,6 +1933,23 @@ config X86_INTEL_TSX_MODE_AUTO side channel attacks- equals the tsx=auto command line parameter. endchoice +config X86_SGX + bool "Software Guard eXtensions (SGX)" + depends on X86_64 && CPU_SUP_INTEL + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + select SRCU + select MMU_NOTIFIER + help + Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions + that can be used by applications to set aside private regions of code + and data, referred to as enclaves. An enclave's private memory can + only be accessed by code running within the enclave. Accesses from + outside the enclave, including other enclaves, are disallowed by + hardware. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 27b5e2bc6a01..80b57e7f4947 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -62,9 +62,6 @@ config EARLY_PRINTK_USB_XDBC You should normally say N here, unless you want to debug early crashes or need a very simple printk logging facility. -config COPY_MC_TEST - def_bool n - config EFI_PGT_DUMP bool "Dump the EFI pagetable" depends on EFI diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1bf21746f4ce..7116da3980be 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -24,14 +24,7 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -# -# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. 
For -# older versions of GCC, include an *assembly* header to make sure that -# gcc doesn't play any games behind our back. -CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h -M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) - -REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \ +REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h deleted file mode 100644 index e19fd7536307..000000000000 --- a/arch/x86/boot/code16gcc.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -# -# code16gcc.h -# -# This file is added to the assembler via -Wa when compiling 16-bit C code. -# This is done this way instead via asm() to make sure gcc does not reorder -# things around us. -# -# gcc 4.9+ has a real -m16 option so we can drop this hack long term. -# - - .code16gcc diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 40b8fd375d52..e0bc3988c3fa 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 017de6cc87dc..e94874f4bbc1 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -241,12 +241,12 @@ SYM_FUNC_START(startup_32) leal rva(startup_64)(%ebp), %eax #ifdef CONFIG_EFI_MIXED movl rva(efi32_boot_args)(%ebp), %edi - cmp $0, %edi + testl %edi, %edi jz 1f leal rva(efi64_stub_entry)(%ebp), %eax movl rva(efi32_boot_args+4)(%ebp), %esi movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer - cmpl $0, %edx + testl %edx, %edx jnz 1f /* * efi_pe_entry uses MS calling convention, which requires 32 bytes of @@ -592,7 +592,7 @@ SYM_CODE_START(trampoline_32bit_src) movl %eax, %cr0 /* Check what paging mode we want to be in after the trampoline */ - cmpl $0, %edx + testl %edx, %edx jz 1f /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ @@ -622,7 +622,7 @@ SYM_CODE_START(trampoline_32bit_src) /* Enable PAE and LA57 (if required) paging modes */ movl $X86_CR4_PAE, %eax - cmpl $0, %edx + testl %edx, %edx jz 1f orl $X86_CR4_LA57, %eax 1: diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 39b2eded7bc2..f7213d0943b8 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -168,16 +168,6 @@ void initialize_identity_maps(void *rmode) write_cr3(top_level_pgt); } -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. 
- */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} - static pte_t *split_large_pmd(struct x86_mapping_info *info, pmd_t *pmdp, unsigned long __address) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index d9a631c5973c..901ea5ebec22 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,6 +12,7 @@ #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c deleted file mode 100644 index 7b7dc05fa1a4..000000000000 --- a/arch/x86/crypto/aes_glue.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 1852b19a73a0..d1436c37008b 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -318,7 +318,7 @@ _initial_blocks_\@: # Main loop - Encrypt/Decrypt remaining blocks - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left_\@ sub $64, %r13 je _four_cipher_left_\@ @@ -437,7 +437,7 @@ _multiple_of_16_bytes_\@: mov PBlockLen(%arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 @@ -474,7 +474,7 @@ _T_8_\@: add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_4_\@: movd %xmm0, %eax @@ -482,7 +482,7 @@ _T_4_\@: add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_123_\@: movd %xmm0, %eax @@ -619,7 +619,7 @@ _get_AAD_blocks\@: /* read the last <16B of AAD */ _get_AAD_rest\@: - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 @@ -640,7 +640,7 @@ _get_AAD_done\@: .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH operation mov PBlockLen(%arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -692,7 +692,7 @@ _no_extra_mask_1_\@: pshufb %xmm2, %xmm3 pxor %xmm3, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -727,7 +727,7 @@ _no_extra_mask_2_\@: pshufb %xmm2, %xmm9 pxor %xmm9, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -747,7 +747,7 @@ _encode_done_\@: pshufb %xmm2, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 @@ -2720,7 +2720,7 @@ SYM_FUNC_END(aesni_ctr_enc) */ SYM_FUNC_START(aesni_xts_crypt8) FRAME_BEGIN - cmpb $0, %cl + testb %cl, %cl movl $0, %ecx movl $240, %r10d leaq _aesni_enc4, %r11 diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 5fee47956f3b..2cf8e94d986a 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -369,7 +369,7 @@ _initial_num_blocks_is_0\@: _initial_blocks_encrypted\@: - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left\@ sub $128, %r13 @@ -528,7 +528,7 @@ _multiple_of_16_bytes\@: vmovdqu HashKey(arg2), %xmm13 mov PBlockLen(arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ #GHASH computation for the last <16 Byte block @@ -573,7 +573,7 @@ _T_8\@: add $8, %r10 
sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_4\@: vmovd %xmm9, %eax @@ -581,7 +581,7 @@ _T_4\@: add $4, %r10 sub $4, %r11 vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_123\@: vmovd %xmm9, %eax @@ -625,7 +625,7 @@ _get_AAD_blocks\@: cmp $16, %r11 jge _get_AAD_blocks\@ vmovdqu \T8, \T7 - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ vpxor \T7, \T7, \T7 @@ -644,7 +644,7 @@ _get_AAD_rest8\@: vpxor \T1, \T7, \T7 jmp _get_AAD_rest8\@ _get_AAD_rest4\@: - cmp $0, %r11 + test %r11, %r11 jle _get_AAD_rest0\@ mov (%r10), %eax movq %rax, \T1 @@ -749,7 +749,7 @@ _done_read_partial_block_\@: .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH ENC_DEC mov PBlockLen(arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -801,7 +801,7 @@ _no_extra_mask_1_\@: vpshufb %xmm2, %xmm3, %xmm3 vpxor %xmm3, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -836,7 +836,7 @@ _no_extra_mask_2_\@: vpshufb %xmm2, %xmm9, %xmm9 vpxor %xmm9, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -856,7 +856,7 @@ _encode_done_\@: vpshufb %xmm2, %xmm9, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 7d568012cc15..71fae5a09e56 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -251,7 +251,7 @@ $code.=<<___; mov %rax,8($ctx) mov %rax,16($ctx) - cmp \$0,$inp + test $inp,$inp je .Lno_key ___ $code.=<<___ if (!$kernel); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index c44aba290fbb..646da46e8d10 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -210,7 +210,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) } poly1305_simd_emit(&dctx->h, dst, dctx->s); - *dctx = (struct poly1305_desc_ctx){}; + memzero_explicit(dctx, sizeof(*dctx)); } EXPORT_SYMBOL(poly1305_final_arch); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 18200135603f..44340a1139e0 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <asm/simd.h> diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index dd06249229e1..3a5f6be7dbba 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -35,7 +35,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <linux/string.h> #include <asm/simd.h> diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 63470fd6ae32..684d58c8bc4f 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -278,7 +278,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # "blocks" is the message length in SHA512 blocks 
######################################################################## SYM_FUNC_START(sha512_transform_avx) - cmp $0, msglen + test msglen, msglen je nowork # Allocate Stack Space diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 7946a1bee85b..50812af0b083 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -280,7 +280,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE ######################################################################## SYM_FUNC_START(sha512_transform_ssse3) - cmp $0, msglen + test msglen, msglen je nowork # Allocate Stack Space diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index b0b05c93409e..30e70f4fe2f7 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,7 +34,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/simd.h> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..18d8f17f755c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0d0667a9fbd7..874aeacde2dd 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -445,3 +445,4 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..78672124d28b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 21243747965d..02e3e42f380b 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -27,9 +27,10 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o +vobjs-$(CONFIG_X86_SGX) += vsgx.o # files to link into kernel -obj-y += vma.o +obj-y += vma.o extable.o KASAN_SANITIZE_vma.o := y UBSAN_SANITIZE_vma.o := y KCSAN_SANITIZE_vma.o := y @@ -98,6 +99,7 @@ $(vobjs): KBUILD_CFLAGS 
:= $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vsgx.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. @@ -128,8 +130,8 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE +$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table +$(obj)/%.so: $(obj)/%.so.dbg $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c new file mode 100644 index 000000000000..afcf5b65beef --- /dev/null +++ b/arch/x86/entry/vdso/extable.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/err.h> +#include <linux/mm.h> +#include <asm/current.h> +#include <asm/traps.h> +#include <asm/vdso.h> + +struct vdso_exception_table_entry { + int insn, fixup; +}; + +bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + const struct vdso_image *image = current->mm->context.vdso_image; + const struct vdso_exception_table_entry *extable; + unsigned int nr_entries, i; + unsigned long base; + + /* + * Do not attempt to fixup #DB or #BP. It's impossible to identify + * whether or not a #DB/#BP originated from within an SGX enclave and + * SGX enclaves are currently the only use case for vDSO fixup. + */ + if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) + return false; + + if (!current->mm->context.vdso) + return false; + + base = (unsigned long)current->mm->context.vdso + image->extable_base; + nr_entries = image->extable_len / (sizeof(*extable)); + extable = image->extable; + + for (i = 0; i < nr_entries; i++) { + if (regs->ip == base + extable[i].insn) { + regs->ip = base + extable[i].fixup; + regs->di = trapnr; + regs->si = error_code; + regs->dx = fault_addr; + return true; + } + } + + return false; +} diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h new file mode 100644 index 000000000000..b56f6b012941 --- /dev/null +++ b/arch/x86/entry/vdso/extable.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_EXTABLE_H +#define __VDSO_EXTABLE_H + +/* + * Inject exception fixup for vDSO code. Unlike normal exception fixup, + * vDSO uses a dedicated handler the addresses are relative to the overall + * exception table, not each individual entry. + */ +#ifdef __ASSEMBLY__ +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ASM_VDSO_EXTABLE_HANDLE from to + +.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req + .pushsection __ex_table, "a" + .long (\from) - __ex_table + .long (\to) - __ex_table + .popsection +.endm +#else +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ".pushsection __ex_table, \"a\"\n" \ + ".long (" #from ") - __ex_table\n" \ + ".long (" #to ") - __ex_table\n" \ + ".popsection\n" +#endif + +#endif /* __VDSO_EXTABLE_H */ diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4d152933547d..dc8da7695859 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -75,11 +75,18 @@ SECTIONS * stuff that isn't used at runtime in between. 
*/ - .text : { *(.text*) } :text =0x90909090, + .text : { + *(.text*) + *(.fixup) + } :text =0x90909090, + + .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text + __ex_table : { *(__ex_table) } :text + /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 36b644e16272..4bf48462fca7 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,6 +27,7 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; + __vdso_sgx_enter_enclave; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 7380908045c7..2d0f3d8bcc25 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,8 @@ struct vdso_sym required_syms[] = { {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, + {"vdso32_rt_sigreturn_landing_pad", true}, + {"vdso32_sigreturn_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 6f46e11ce539..1c7cfac7e64a 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -5,6 +5,41 @@ * are built for 32-bit userspace. */ +static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", (int)(data)[i]); + } +} + + +/* + * Extract a section from the input data into a standalone blob. Used to + * capture kernel-only data that needs to persist indefinitely, e.g. the + * exception fixup tables, but only in the kernel, i.e. the section can + * be stripped from the final vDSO image. 
+ */ +static void BITSFUNC(extract)(const unsigned char *data, size_t data_len, + FILE *outfile, ELF(Shdr) *sec, const char *name) +{ + unsigned long offset; + size_t len; + + offset = (unsigned long)GET_LE(&sec->sh_offset); + len = (size_t)GET_LE(&sec->sh_size); + + if (offset + len > data_len) + fail("section to extract overruns input data"); + + fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len); + BITSFUNC(copy)(outfile, data + offset, len); + fprintf(outfile, "\n};\n\n"); +} + static void BITSFUNC(go)(void *raw_addr, size_t raw_len, void *stripped_addr, size_t stripped_len, FILE *outfile, const char *image_name) @@ -15,7 +50,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, - *alt_sec = NULL; + *alt_sec = NULL, *extable_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; INT_BITS syms[NSYMS] = {}; @@ -77,6 +112,8 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (!strcmp(secstrings + GET_LE(&sh->sh_name), ".altinstructions")) alt_sec = sh; + if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table")) + extable_sec = sh; } if (!symtab_hdr) @@ -155,6 +192,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); + if (extable_sec) + BITSFUNC(extract)(raw_addr, raw_len, outfile, + extable_sec, "extable"); fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); @@ -165,6 +205,14 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "\t.alt_len = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_size)); } + if (extable_sec) { + fprintf(outfile, "\t.extable_base = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_offset)); + fprintf(outfile, "\t.extable_len = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_size)); + fprintf(outfile, "\t.extable = extable,\n"); + } + for (i = 0; i < NSYMS; i++) { if (required_syms[i].export && syms[i]) fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index c3233ee98a6b..1bd068f72d4c 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -18,6 +18,7 @@ __kernel_sigreturn: movl $__NR_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_sigreturn: +SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_sigreturn,.-.LSTART_sigreturn @@ -29,6 +30,7 @@ __kernel_rt_sigreturn: movl $__NR_rt_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: +SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 9185cb1d13b9..825e829ffff1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -89,30 +89,14 @@ static void vdso_fix_landing(const struct vdso_image *image, static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; const struct vdso_image *image = current->mm->context.vdso_image; - if (image->size != new_size) - return -EINVAL; - vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } -static int vvar_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct 
*new_vma) -{ - const struct vdso_image *image = new_vma->vm_mm->context.vdso_image; - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - - if (new_size != -image->sym_vvar_start) - return -EINVAL; - - return 0; -} - #ifdef CONFIG_TIME_NS static struct page *find_timens_vvar_page(struct vm_area_struct *vma) { @@ -252,7 +236,6 @@ static const struct vm_special_mapping vdso_mapping = { static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }; /* @@ -413,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) + int uses_interp, bool x32) { #ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) { + if (x32) { if (!vdso64_enabled) return 0; return map_vdso_randomized(&vdso_image_x32); @@ -436,6 +419,21 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif +bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + const struct vdso_image *image = current->mm->context.vdso_image; + unsigned long vdso = (unsigned long) current->mm->context.vdso; + + if (in_ia32_syscall() && image == &vdso_image_32) { + if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || + regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) + return true; + } +#endif + return false; +} + #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S new file mode 100644 index 000000000000..86a0e94f68df --- /dev/null +++ b/arch/x86/entry/vdso/vsgx.S @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/export.h> +#include <asm/errno.h> +#include <asm/enclu.h> + +#include "extable.h" + +/* Relative to %rbp. */ +#define SGX_ENCLAVE_OFFSET_OF_RUN 16 + +/* The offsets relative to struct sgx_enclave_run. */ +#define SGX_ENCLAVE_RUN_TCS 0 +#define SGX_ENCLAVE_RUN_LEAF 8 +#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12 +#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14 +#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16 +#define SGX_ENCLAVE_RUN_USER_HANDLER 24 +#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */ +#define SGX_ENCLAVE_RUN_RESERVED_START 40 +#define SGX_ENCLAVE_RUN_RESERVED_END 256 + +.code64 +.section .text, "ax" + +SYM_FUNC_START(__vdso_sgx_enter_enclave) + /* Prolog */ + .cfi_startproc + push %rbp + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbp, 0 + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + push %rbx + .cfi_rel_offset %rbx, -8 + + mov %ecx, %eax +.Lenter_enclave: + /* EENTER <= function <= ERESUME */ + cmp $EENTER, %eax + jb .Linvalid_input + cmp $ERESUME, %eax + ja .Linvalid_input + + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx + + /* Validate that the reserved area contains only zeros. */ + mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx +1: + cmpq $0, (%rcx, %rbx) + jne .Linvalid_input + add $8, %rbx + cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx + jne 1b + + /* Load TCS and AEP */ + mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx + lea .Lasync_exit_pointer(%rip), %rcx + + /* Single ENCLU serving as both EENTER and AEP (ERESUME) */ +.Lasync_exit_pointer: +.Lenclu_eenter_eresume: + enclu + + /* EEXIT jumps here unless the enclave is doing something fancy. */ + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set exit_reason. 
*/ + movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx) + + /* Invoke userspace's exit handler if one was provided. */ +.Lhandle_exit: + cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx) + jne .Linvoke_userspace_handler + + /* Success, in the sense that ENCLU was attempted. */ + xor %eax, %eax + +.Lout: + pop %rbx + leave + .cfi_def_cfa %rsp, 8 + ret + + /* The out-of-line code runs with the pre-leave stack frame. */ + .cfi_def_cfa %rbp, 16 + +.Linvalid_input: + mov $(-EINVAL), %eax + jmp .Lout + +.Lhandle_exception: + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set the exception info. */ + mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx) + mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx) + mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx) + mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx) + jmp .Lhandle_exit + +.Linvoke_userspace_handler: + /* Pass the untrusted RSP (at exit) to the callback via %rcx. */ + mov %rsp, %rcx + + /* Save struct sgx_enclave_exception %rbx is about to be clobbered. */ + mov %rbx, %rax + + /* Save the untrusted RSP offset in %rbx (non-volatile register). */ + mov %rsp, %rbx + and $0xf, %rbx + + /* + * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned + * _after_ pushing the parameters on the stack, hence the bonus push. + */ + and $-0x10, %rsp + push %rax + + /* Push struct sgx_enclave_exception as a param to the callback. */ + push %rax + + /* Clear RFLAGS.DF per x86_64 ABI */ + cld + + /* + * Load the callback pointer to %rax and lfence for LVI (load value + * injection) protection before making the call. + */ + mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax + lfence + call *%rax + + /* Undo the post-exit %rsp adjustment. */ + lea 0x10(%rsp, %rbx), %rsp + + /* + * If the return from callback is zero or negative, return immediately, + * else re-execute ENCLU with the postive return value interpreted as + * the requested ENCLU function. + */ + cmp $0, %eax + jle .Lout + jmp .Lenter_enclave + + .cfi_endproc + +_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception) + +SYM_FUNC_END(__vdso_sgx_enter_enclave) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 44c33103a955..1b40b9297083 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -316,7 +316,7 @@ static struct vm_area_struct gate_vma __ro_after_init = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT - if (!mm || mm->context.ia32_compat) + if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL)) return NULL; #endif if (vsyscall_mode == NONE) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 39eb276d0277..2c1791c4a518 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -538,7 +538,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = amd_get_nb_id(cpu); + nb_id = topology_die_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a88c94d65693..e37de298a495 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1174,7 +1174,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; - /* fall through */ + fallthrough; case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + @@ -2602,7 +2602,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; - if (!test_thread_flag(TIF_IA32)) + if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index af457f8cb29d..d4569bfa83e3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ - INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), @@ -1900,6 +1901,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs }, }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0"); + +static struct attribute *tnt_events_attrs[] = { + EVENT_PTR(td_fe_bound_tnt), + EVENT_PTR(td_retiring_tnt), + EVENT_PTR(td_bad_spec_tnt), + EVENT_PTR(td_be_bound_tnt), + NULL, +}; + static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), @@ -5173,6 +5187,7 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.get_event_constraints = tnt_get_event_constraints; + td_attr = tnt_events_attrs; extra_attr = slm_format_attr; pr_cont("Tremont events, "); name = "Tremont"; @@ -5442,6 +5457,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -5464,7 +5480,7 @@ __init int intel_pmu_init(void) mem_attr = icl_events_attrs; td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.update_topdown_event = icl_update_topdown_event; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4eb7ee5fed72..407eee5f6f95 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -51,46 +51,46 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL + * ICL,TGL,RKL * 
Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT + * KBL,CML,ICL,TGL,TNT,RKL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT + * TNT,RKL * Scope: Package (physical package) * */ @@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 485c5066f8b8..67dbc91bccfe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -961,7 +961,8 @@ static void adaptive_pebs_record_size_update(void) #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_DATA_PAGE_SIZE) static u64 pebs_update_adaptive_cfg(struct perf_event *event) { @@ -1261,7 +1262,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) old_to = to; #ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); + is_64bit = kernel_ip(to) || any_64bit_mode(regs); #endif insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); @@ -1337,6 +1338,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_DATA_PAGE_SIZE) + static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -1451,7 +1456,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, } - if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && + if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; @@ -1579,7 +1584,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (sample_type & 
PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; if (sample_type & PERF_SAMPLE_TRANSACTION) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8961653c5dd2..21890dacfcfe 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info) return !(info & LBR_INFO_MISPRED); } -static __always_inline bool get_lbr_cycles(u64 info) +static __always_inline u16 get_lbr_cycles(u64 info) { if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) @@ -1221,7 +1221,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * on 64-bit systems running 32-bit apps */ #ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 80d52cbe2fde..357258f82dc8 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1636,6 +1636,11 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { .mmio_init = tgl_l_uncore_mmio_init, }; +static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { + .cpu_init = tgl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1683,6 +1688,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index bbd1120ae161..098f893e2e22 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -60,7 +60,8 @@ #define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12 #define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 - +#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 +#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = { NULL, }; +static void rkl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + void tgl_uncore_cpu_init(void) { uncore_msr_uncores = tgl_msr_uncores; @@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void) icl_uncore_cbox.ops = &skl_uncore_msr_ops; icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; + skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box; } enum { @@ -926,6 +934,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC), + .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -1019,6 +1035,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver), + IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver), { /* end marker */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4be8f9cabd07..680404c58cb1 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6a8edfe59b09..7895cf4c59a7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -132,7 +132,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..5e3d9b7fd5fb 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -347,7 +347,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, */ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); user_access_end(); if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 6d2df1ee427b..65064d9f7fa6 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -159,6 +159,8 @@ static inline u64 x86_default_get_root_pointer(void) extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ +struct cper_ia_proc_ctx; + #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -177,6 +179,15 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) */ return PAGE_KERNEL_NOENC; } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id); +#else +static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id) +{ + return -EINVAL; +} #endif #define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4e3099d9ae62..34cb3c159481 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -259,6 +259,7 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; +extern void __init x2apic_set_max_apicid(u32 apicid); extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) @@ -305,11 +306,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - /* dest_logical is used by the IPI functions */ - u32 dest_logical; u32 disable_esr; - u32 irq_delivery_mode; - u32 irq_dest_mode; + 
+ enum apic_delivery_modes delivery_mode; + bool dest_mode_logical; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -520,12 +520,10 @@ static inline void apic_smt_update(void) { } #endif struct msi_msg; +struct irq_cfg; -#ifdef CONFIG_PCI_MSI -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); -#else -# define x86_vector_msi_compose_msg NULL -#endif +extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar); extern void ioapic_zap_locks(void); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 05e694ed8386..5716f22f81ac 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -432,15 +432,13 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 +enum apic_delivery_modes { + APIC_DELIVERY_MODE_FIXED = 0, + APIC_DELIVERY_MODE_LOWESTPRIO = 1, + APIC_DELIVERY_MODE_SMI = 2, + APIC_DELIVERY_MODE_NMI = 4, + APIC_DELIVERY_MODE_INIT = 5, + APIC_DELIVERY_MODE_EXTINT = 7, }; #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b6cac6e9bb70..f732741ad7c7 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -199,7 +199,7 @@ static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 809bd010a751..7886d0578fc9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -187,7 +187,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { - return try_cmpxchg(&v->counter, old, new); + return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b63c7feab7..86b2e0dcc4bf 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_CACHEINFO_H #define _ASM_X86_CACHEINFO_H -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a8bfac131256..4d4ec5cbdc51 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,7 +221,7 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) -#define try_cmpxchg(ptr, pold, new) \ +#define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) /* diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50f..f145e3326c6d 100644 --- a/arch/x86/include/asm/compat.h +++ 
b/arch/x86/include/asm/compat.h @@ -177,14 +177,13 @@ typedef struct user_regs_struct compat_elf_gregset_t; static inline void __user *arch_compat_alloc_user_space(long len) { - compat_uptr_t sp; - - if (test_thread_flag(TIF_IA32)) { - sp = task_pt_regs(current)->sp; - } else { - /* -128 for the x32 ABI redzone */ - sp = task_pt_regs(current)->sp - 128; - } + compat_uptr_t sp = task_pt_regs(current)->sp; + + /* + * -128 for the x32 ABI redzone. For IA32, it is not strictly + * necessary, but not harmful. + */ + sp -= 128; return (void __user *)round_down(sp - len, 16); } diff --git a/arch/x86/include/asm/copy_mc_test.h b/arch/x86/include/asm/copy_mc_test.h deleted file mode 100644 index e4991ba96726..000000000000 --- a/arch/x86/include/asm/copy_mc_test.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _COPY_MC_TEST_H_ -#define _COPY_MC_TEST_H_ - -#ifndef __ASSEMBLY__ -#ifdef CONFIG_COPY_MC_TEST -extern unsigned long copy_mc_test_src; -extern unsigned long copy_mc_test_dst; - -static inline void copy_mc_inject_src(void *addr) -{ - if (addr) - copy_mc_test_src = (unsigned long) addr; - else - copy_mc_test_src = ~0UL; -} - -static inline void copy_mc_inject_dst(void *addr) -{ - if (addr) - copy_mc_test_dst = (unsigned long) addr; - else - copy_mc_test_dst = ~0UL; -} -#else /* CONFIG_COPY_MC_TEST */ -static inline void copy_mc_inject_src(void *addr) -{ -} - -static inline void copy_mc_inject_dst(void *addr) -{ -} -#endif /* CONFIG_COPY_MC_TEST */ - -#else /* __ASSEMBLY__ */ -#include <asm/export.h> - -#ifdef CONFIG_COPY_MC_TEST -.macro COPY_MC_TEST_CTL - .pushsection .data - .align 8 - .globl copy_mc_test_src - copy_mc_test_src: - .quad 0 - EXPORT_SYMBOL_GPL(copy_mc_test_src) - .globl copy_mc_test_dst - copy_mc_test_dst: - .quad 0 - EXPORT_SYMBOL_GPL(copy_mc_test_dst) - .popsection -.endm - -.macro COPY_MC_TEST_SRC reg count target - leaq \count(\reg), %r9 - cmp copy_mc_test_src, %r9 - ja \target -.endm - -.macro COPY_MC_TEST_DST reg count target - leaq \count(\reg), %r9 - cmp copy_mc_test_dst, %r9 - ja \target -.endm -#else -.macro COPY_MC_TEST_CTL -.endm - -.macro COPY_MC_TEST_SRC reg count target -.endm - -.macro COPY_MC_TEST_DST reg count target -.endm -#endif /* CONFIG_COPY_MC_TEST */ -#endif /* __ASSEMBLY__ */ -#endif /* _COPY_MC_TEST_H_ */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dad350d42ecf..84b887825f12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -237,10 +237,12 @@ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ @@ -356,6 +358,7 @@ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) 
/* MOVDIR64B instruction */ #define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ @@ -374,6 +377,7 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5861d34f9771..7947cb1782da 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,6 +62,12 @@ # define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #endif +#ifdef CONFIG_X86_SGX +# define DISABLE_SGX 0 +#else +# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -74,7 +80,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP) +#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bc9758ef292e..c98f78330b09 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -213,8 +213,6 @@ static inline bool efi_is_64bit(void) static inline bool efi_is_native(void) { - if (!IS_ENABLED(CONFIG_X86_64)) - return true; return efi_is_64bit(); } @@ -382,4 +380,7 @@ static inline void efi_fake_memmap_early(void) } #endif +#define arch_ima_efi_boot_mode \ + ({ extern struct boot_params boot_params; boot_params.secure_boot; }) + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b9a5d488f1a5..66bdfe838d61 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -186,8 +186,9 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread compat_start_thread +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32); +#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ + compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64) void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ @@ -361,7 +362,7 @@ do { \ #define AT_SYSINFO 32 #define COMPAT_ARCH_DLINFO \ -if (test_thread_flag(TIF_X32)) \ +if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ else \ ARCH_DLINFO_IA32 @@ -382,8 +383,12 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages + int uses_interp, bool x32); +#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ + 
compat_arch_setup_additional_pages(bprm, interpreter, \ + (ex->e_machine == EM_X86_64)) + +extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/arch/x86/include/asm/enclu.h b/arch/x86/include/asm/enclu.h new file mode 100644 index 000000000000..b1314e41a744 --- /dev/null +++ b/arch/x86/include/asm/enclu.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ENCLU_H +#define _ASM_X86_ENCLU_H + +#define EENTER 0x02 +#define ERESUME 0x03 +#define EEXIT 0x04 + +#endif /* _ASM_X86_ENCLU_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 77217bd292bd..9f1a0a987e5e 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,13 +14,20 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +#include <asm/kmap_size.h> + /* * Exposed to assembly code for setting up initial page tables. Cannot be * calculated in assembly code (fixmap entries are an enum), but is sanity * checked in the actual fixmap C code to make sure that the fixmap is * covered fully. */ -#define FIXMAP_PMD_NUM 2 +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define FIXMAP_PMD_NUM 2 +#else +# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512)) +# define FIXMAP_PMD_NUM (KM_PMDS + 2) +#endif /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ #define FIXMAP_PMD_TOP 507 @@ -31,7 +38,6 @@ #include <asm/pgtable_types.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> -#include <asm/kmap_types.h> #else #include <uapi/asm/vsyscall.h> #endif @@ -92,9 +98,9 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_32 +#ifdef CONFIG_KMAP_LOCAL FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -151,7 +157,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index dcd9503b1098..a5aba4ab0224 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -29,17 +29,32 @@ extern void fpregs_mark_activate(void); * A context switch will (and softirq might) save CPU's FPU registers to * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). 
*/ static inline void fpregs_lock(void) { - preempt_disable(); - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,24 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 0f420b24e0fc..032e020853aa 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -23,7 +23,6 @@ #include <linux/interrupt.h> #include <linux/threads.h> -#include <asm/kmap_types.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/fixmap.h> @@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_pfn(unsigned long pfn); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); - #define flush_cache_kmaps() do { } while (0) +#define arch_kmap_local_post_map(vaddr, pteval) \ + arch_flush_lazy_mmu_mode() + +#define arch_kmap_local_post_unmap(vaddr) \ + do { \ + flush_tlb_one_kernel((vaddr)); \ + arch_flush_lazy_mmu_mode(); \ + } while (0) + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 6352dee37cda..ab9f3dd87c80 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,17 +74,6 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -struct irq_data; -struct hpet_channel; -struct irq_domain; - -extern void hpet_msi_unmask(struct irq_data *data); -extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); -extern struct irq_domain *hpet_create_irq_domain(int hpet_id); -extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_channel *hc, int dev_num); - #ifdef CONFIG_HPET_EMULATE_RTC #include <linux/interrupt.h> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a4aeeaace040..d465ece58151 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -39,18 +39,16 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSI, X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_AMDVI, X86_IRQ_ALLOC_TYPE_UV, - X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, - X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; struct ioapic_alloc_info { - int pin; - int node; - u32 trigger : 1; - u32 polarity : 1; - u32 valid : 1; - struct IO_APIC_route_entry *entry; 
+ int pin; + int node; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; }; struct uv_alloc_info { diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0ed20e8bba9e..6bf42aed387e 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -23,6 +23,13 @@ #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 #define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 +#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ + +#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 +/* Support for the extended IOAPIC RTE format */ +#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) + #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 #define HYPERV_CPUID_MAX 0x4000ffff diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..247a60a47331 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include <asm/irq_stack.h> -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index bd7f02480ca1..438ccd4f3cc4 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -143,21 +143,6 @@ .macro MODRM mod opd1 opd2 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a1a26f6d3aa4..437aa8d00e53 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -13,15 +13,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -/* I/O Unit Redirection Table */ -#define IO_APIC_REDIR_VECTOR_MASK 0x000FF -#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 -#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 -#define IO_APIC_REDIR_SEND_PENDING (1 << 12) -#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) -#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) -#define IO_APIC_REDIR_MASKED (1 << 16) - /* * The structure of the IO-APIC: */ @@ -65,52 +56,40 @@ union IO_APIC_reg_03 { }; struct IO_APIC_route_entry { - __u32 vector : 8, - delivery_mode : 3, /* 000: FIXED - * 001: lowest prio - * 111: ExtINT - */ - dest_mode : 1, /* 0: physical, 1: logical */ - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, /* 0: edge, 1: level */ - mask : 1, /* 0: enabled, 1: disabled */ - __reserved_2 : 15; - - __u32 __reserved_3 : 24, - dest : 8; -} __attribute__ ((packed)); - -struct IR_IO_APIC_route_entry { - __u64 vector : 8, - zero : 3, - index2 : 1, - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, - mask : 1, - reserved : 31, - format : 1, - index : 15; + union { + struct { + u64 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + delivery_status : 1, + active_low : 1, + irr : 1, + is_level : 1, + masked : 1, + reserved_0 : 15, + reserved_1 : 17, + virt_destid_8_14 : 7, + destid_0_7 : 8; + }; + struct { + u64 ir_shared_0 : 8, + ir_zero : 3, + ir_index_15 : 1, + ir_shared_1 : 5, + ir_reserved_0 : 31, + ir_format : 1, + ir_index_0_14 : 
15; + }; + struct { + u64 w1 : 32, + w2 : 32; + }; + }; } __attribute__ ((packed)); struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#define IOAPIC_MASKED 1 -#define IOAPIC_UNMASKED 0 - -#define IOAPIC_POL_HIGH 0 -#define IOAPIC_POL_LOW 1 - -#define IOAPIC_DEST_MODE_PHYSICAL 0 -#define IOAPIC_DEST_MODE_LOGICAL 1 - #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bacf68c4d70e..e2de092fc38c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -9,19 +9,14 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void -iounmap_atomic(void __iomem *kvaddr); +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); -int -iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); - -void -iomap_free(resource_size_t base, unsigned long size); +void iomap_free(resource_size_t base, unsigned long size); #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index af4a151d70b3..7cc49432187f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,9 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info); - /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. 
*/ extern struct irq_domain * arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); @@ -71,11 +68,5 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index cd684d45cb5f..125c23b7bad3 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -12,6 +12,9 @@ enum { X86_IRQ_ALLOC_LEGACY = 0x2, }; +extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); +extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec); + extern struct irq_domain *x86_vector_domain; extern void init_irq_alloc_info(struct irq_alloc_info *info, diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h deleted file mode 100644 index 04ab8266e347..000000000000 --- a/arch/x86/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KMAP_TYPES_H -#define _ASM_X86_KMAP_TYPES_H - -#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e5f33a0d0e2..3ab7b46087b7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,6 +614,7 @@ struct kvm_vcpu_arch { struct kvm_pio_request pio; void *pio_data; + void *guest_ins_data; u8 event_exit_inst_len; @@ -805,6 +806,9 @@ struct kvm_vcpu_arch { */ bool enforce; } pv_cpuid; + + /* Protected Guests */ + bool guest_state_protected; }; struct kvm_lpage_info { @@ -1088,7 +1092,7 @@ struct kvm_x86_ops { void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); - bool (*has_emulated_msr)(u32 index); + bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; @@ -1115,7 +1119,8 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1231,6 +1236,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int (*cpu_dirty_log_size)(void); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1280,6 +1286,7 @@ struct kvm_x86_ops { void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); + int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); }; struct kvm_x86_nested_ops { @@ -1470,6 +1477,10 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); + +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned 
long old_cr0, unsigned long cr0); +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); @@ -1696,7 +1707,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); @@ -1743,4 +1755,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int kvm_cpu_dirty_log_size(void); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include <asm/setup.h> #include <linux/ftrace.h> -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/x86/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local64.h> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a0f147893a04..56cdeaac76a0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -177,7 +177,8 @@ enum mce_notifier_prios { MCE_PRIO_EXTLOG, MCE_PRIO_UC, MCE_PRIO_EARLY, - MCE_PRIO_CEC + MCE_PRIO_CEC, + MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; struct notifier_block; @@ -198,16 +199,22 @@ static inline void enable_copy_mc_fragile(void) } #endif +struct cper_ia_proc_ctx; + #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} +static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, + u64 lapic_id) { return -EINVAL; } #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2f62bbdd9d12..31c4df123aa0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -37,6 +37,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -69,6 +70,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init 
sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9257667d13c5..5d7494631ea9 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -6,6 +6,12 @@ #include <linux/rwsem.h> #include <linux/mutex.h> #include <linux/atomic.h> +#include <linux/bits.h> + +/* Uprobes on this MM assume 32-bit code */ +#define MM_CONTEXT_UPROBE_IA32 BIT(0) +/* vsyscall page is accessible on this MM */ +#define MM_CONTEXT_HAS_VSYSCALL BIT(1) /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -33,8 +39,7 @@ typedef struct { #endif #ifdef CONFIG_X86_64 - /* True if mm supports a task running in 32 bit compatibility mode. */ - unsigned short ia32_compat; + unsigned short flags; #endif struct mutex lock; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index d98016b83755..27516046117a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -91,12 +91,14 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) } #endif +#define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). */ +#define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -116,6 +118,8 @@ static inline int init_new_context(struct task_struct *tsk, init_new_context_ldt(mm); return 0; } + +#define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); @@ -177,7 +181,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || - !(mm->context.ia32_compat == TIF_IA32); + !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); } #else static inline bool is_64bit_mm(struct mm_struct *mm) @@ -214,4 +218,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, unsigned long __get_current_cr3_fast(void); +#include <asm-generic/mmu_context.h> + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index cd30013d15d3..b85147d75626 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -9,4 +9,54 @@ typedef struct irq_alloc_info msi_alloc_info_t; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); +/* Structs and defines for the X86 specific MSI message format */ + +typedef struct x86_msi_data { + u32 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + reserved : 2, + active_low : 1, + is_level : 1; + + u32 dmar_subhandle; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#define arch_msi_msg_data x86_msi_data + +typedef struct x86_msi_addr_lo { + union { + struct { + u32 reserved_0 : 2, + dest_mode_logical : 1, + redirect_hint : 1, + reserved_1 : 1, + virt_destid_8_14 : 7, + destid_0_7 : 8, + base_address : 12; + }; + struct { + u32 dmar_reserved_0 : 2, + dmar_index_15 : 1, + dmar_subhandle_valid : 1, + dmar_format : 1, + dmar_index_0_14 : 15, + dmar_base_address : 12; + }; + }; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#define arch_msi_msg_addr_lo x86_msi_addr_lo + +#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20) + +typedef struct x86_msi_addr_hi { + u32 
reserved : 8, + destid_8_31 : 24; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#define arch_msi_msg_addr_hi x86_msi_addr_hi + +#define X86_MSI_BASE_ADDRESS_HIGH (0) + +struct msi_msg; +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h deleted file mode 100644 index ee2f8ccc32d0..000000000000 --- a/arch/x86/include/asm/msidef.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MSIDEF_H -#define _ASM_X86_MSIDEF_H - -/* - * Constants for Intel APIC based MSI messages. - */ - -/* - * Shifts for MSI data - */ - -#define MSI_DATA_VECTOR_SHIFT 0 -#define MSI_DATA_VECTOR_MASK 0x000000ff -#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \ - MSI_DATA_VECTOR_MASK) - -#define MSI_DATA_DELIVERY_MODE_SHIFT 8 -#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) -#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) - -#define MSI_DATA_LEVEL_SHIFT 14 -#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) -#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) - -#define MSI_DATA_TRIGGER_SHIFT 15 -#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) -#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) - -/* - * Shift/mask fields for msi address - */ - -#define MSI_ADDR_BASE_HI 0 -#define MSI_ADDR_BASE_LO 0xfee00000 - -#define MSI_ADDR_DEST_MODE_SHIFT 2 -#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT) -#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT) - -#define MSI_ADDR_REDIRECTION_SHIFT 3 -#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) - /* dedicated cpu */ -#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) - /* lowest priority */ - -#define MSI_ADDR_DEST_ID_SHIFT 12 -#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 -#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ - MSI_ADDR_DEST_ID_MASK) -#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) - -#define MSI_ADDR_IR_EXT_INT (1 << 4) -#define MSI_ADDR_IR_SHV (1 << 3) -#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13) -#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5) -#endif /* _ASM_X86_MSIDEF_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 972a34d93505..546d6ecf0a35 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -139,6 +139,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_ERROR_CONTROL 0x0000017f #define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 @@ -326,8 +327,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 -#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b #define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 +#define MSR_AMD_CORE_ENERGY_STATUS 0xc001029a +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 @@ -470,6 +472,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 @@ -609,6 +612,8 @@ #define FEAT_CTL_LOCKED BIT(0) #define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1) #define 
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2) +#define FEAT_CTL_SGX_LC_ENABLED BIT(17) +#define FEAT_CTL_SGX_ENABLED BIT(18) #define FEAT_CTL_LMCE_ENABLED BIT(20) #define MSR_IA32_TSC_ADJUST 0x0000003b @@ -628,6 +633,12 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +/* Intel SGX Launch Enclave Public Key Hash MSRs */ +#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C +#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D +#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E +#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F + #define MSR_IA32_SMM_MONITOR_CTL 0x0000009b #define MSR_IA32_SMBASE 0x0000009e diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f462895a33e4..faf9cc1c14bb 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -53,7 +53,13 @@ #define STACK_TOP_MAX STACK_TOP /* - * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) + * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual + * address for the kernel image, rather than the limit on the size itself. On + * 32-bit, this is not a strict limit, but this value is used to limit the + * link-time virtual address range of the kernel, and by KASLR to limit the + * randomized address from which the kernel is executed. A relocatable kernel + * can be loaded somewhat higher than KERNEL_IMAGE_SIZE as long as enough space + * remains for the vmalloc area. */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 3f49dac03617..645bd1d0ee07 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -98,8 +98,10 @@ #define STACK_TOP_MAX TASK_SIZE_MAX /* - * Maximum kernel image size is limited to 1 GiB, due to the fixmap living - * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). + * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual + * address for the kernel image, rather than the limit on the size itself. + * This can be at most 1 GiB, due to the fixmap living in the next 1 GiB (see + * level2_kernel_pgt in arch/x86/kernel/head_64.S). * * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the * page tables are fully set up. 
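The rewritten comment above treats KERNEL_IMAGE_SIZE as a cap on the kernel image's highest virtual address rather than on its byte size. The following minimal C sketch of the resulting 64-bit layout is illustrative only and not part of the patch: MODULES_VADDR, MODULES_END and the DEBUG_KMAP_LOCAL_FORCE_MAP variant mirror the pgtable_64_types.h hunk later in this series, while the __START_KERNEL_map base address and the 1 GiB KASLR value are assumptions taken from mainline and are not shown in this diff.

/*
 * Illustrative sketch only -- not part of the patch. The SKETCH_* names are
 * hypothetical stand-ins for the real constants.
 */
#define SKETCH_START_KERNEL_MAP   0xffffffff80000000UL   /* assumed base of the kernel text mapping */
#define SKETCH_KERNEL_IMAGE_SIZE  (1024UL * 1024 * 1024)  /* assumed 1 GiB cap when KASLR is enabled */
#define SKETCH_MODULES_VADDR      (SKETCH_START_KERNEL_MAP + SKETCH_KERNEL_IMAGE_SIZE)
#define SKETCH_MODULES_END        0xffffffffff000000UL    /* 0xfffffffffe000000UL with DEBUG_KMAP_LOCAL_FORCE_MAP */
#define SKETCH_MODULES_LEN        (SKETCH_MODULES_END - SKETCH_MODULES_VADDR)
/*
 * Raising the image limit pushes SKETCH_MODULES_VADDR up and shrinks
 * SKETCH_MODULES_LEN, which is why the address cap stays at 1 GiB even
 * though the macro name suggests a size.
 */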
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d25cc6830e89..f8dce11d2bc1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -812,17 +812,6 @@ extern void default_banner(void); #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_PARAVIRT_XXL - -#define GET_CR2_INTO_AX \ - PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ - ) - -#endif /* CONFIG_PARAVIRT_XXL */ - - #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0fad9f61c76a..b6b02b7c19cc 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,7 +41,6 @@ #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> -#include <asm/kmap_types.h> #include <asm/pgtable_types.h> #include <asm/nospec-branch.h> diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index d7acae4120d5..7c9c968a42ef 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -57,19 +57,13 @@ do { \ #endif /* - * This is how much memory in addition to the memory covered up to - * and including _end we need mapped initially. - * We need: - * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) - * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * This is used to calculate the .brk reservation for initial pagetables. + * Enough space is reserved to allocate pagetables sufficient to cover all + * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of + * lowmem. * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - * - * KERNEL_IMAGE_SIZE should be greater than pa(_end) - * and small than max_low_pfn, otherwise will waste some page table entries + * With PAE paging (PTRS_PER_PMD > 1), we allocate PTRS_PER_PGD == 4 pages for + * the PMD's in addition to the pages required for the last level pagetables. 
*/ #if PTRS_PER_PMD > 1 #define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..91ac10654570 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END _AC(0xffffffffff000000, UL) +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define MODULES_END _AC(0xffffffffff000000, UL) +#else +# define MODULES_END _AC(0xfffffffffe000000, UL) +#endif #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 82a08b585818..c20a52b5534b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -813,10 +813,8 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); #ifdef CONFIG_CPU_SUP_AMD -extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); #else -static inline u16 amd_get_nb_id(int cpu) { return 0; } static inline u32 amd_get_nodes_per_socket(void) { return 0; } #endif diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 2bd1338de236..fef16e398161 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -16,6 +16,26 @@ #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn #endif +#ifdef CONFIG_X86_64 +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "x86_64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 +# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ia32" +# endif +/* + * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support + * caching them and they are treated as out of range syscalls, which will + * always pass through the BPF filter. 
+ */ +#else /* !CONFIG_X86_64 */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ia32" +#endif + #include <asm-generic/seccomp.h> #endif /* _ASM_X86_SECCOMP_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c5..4352f08bfbb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 49600643faba..f248eb2ac2d4 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -88,9 +88,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) return (unsigned long *)task->thread.sp; } -void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, const char *log_lvl); - /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 71d630bb5e08..1c561945b426 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -98,6 +98,16 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + TRAP_EFER_WRITE, + TRAP_CR0_WRITE, + TRAP_CR1_WRITE, + TRAP_CR2_WRITE, + TRAP_CR3_WRITE, + TRAP_CR4_WRITE, + TRAP_CR5_WRITE, + TRAP_CR6_WRITE, + TRAP_CR7_WRITE, + TRAP_CR8_WRITE, /* Byte offset 014h (word 5) */ INTERCEPT_INVLPGB = 160, INTERCEPT_INVLPGB_ILLEGAL, @@ -130,7 +140,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 exit_int_info_err; u64 nested_ctl; u64 avic_vapic_bar; - u8 reserved_4[8]; + u64 ghcb_gpa; u32 event_inj; u32 event_inj_err; u64 nested_cr3; @@ -144,6 +154,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ }; @@ -178,7 +190,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) -#define SVM_INTERRUPT_SHADOW_MASK 1 +#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0) +#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1) #define SVM_IOIO_STR_SHIFT 2 #define SVM_IOIO_REP_SHIFT 3 @@ -197,6 +210,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) +#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) struct vmcb_seg { u16 selector; @@ -220,7 +234,8 @@ struct vmcb_save_area { u8 cpl; u8 reserved_2[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_3[104]; + u64 xss; /* Valid for SEV-ES only */ u64 cr4; u64 cr3; u64 cr0; @@ -251,9 +266,12 @@ struct vmcb_save_area { /* * The following part of the save area is valid only for - * SEV-ES guests when referenced through the GHCB. + * SEV-ES guests when referenced through the GHCB or for + * saving to the host save area. 
*/ - u8 reserved_7[104]; + u8 reserved_7[80]; + u32 pkru; + u8 reserved_7a[20]; u64 reserved_8; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; @@ -294,7 +312,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) @@ -379,6 +397,16 @@ struct vmcb { (unsigned long *)&ghcb->save.valid_bitmap); \ } \ \ + static inline u64 ghcb_get_##field(struct ghcb *ghcb) \ + { \ + return ghcb->save.field; \ + } \ + \ + static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ + { \ + return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \ + } \ + \ static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ { \ __set_bit(GHCB_BITMAP_IDX(field), \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 44733a4bfc42..0d751d5da702 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,6 +55,7 @@ struct task_struct; struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ }; @@ -74,15 +75,11 @@ struct thread_info { * - these are process state flags that various assembly files * may need to access */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ -#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -91,7 +88,7 @@ struct thread_info { #define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_IA32 17 /* IA32 compatibility process */ +#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -99,19 +96,13 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ -#define TIF_X32 30 /* 32-bit native x86-64 binary */ -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define 
_TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) @@ -120,16 +111,14 @@ struct thread_info { #define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) -#define _TIF_X32 (1 << TIF_X32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f4234575f3fd..488a8e848754 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) } #endif +#ifdef CONFIG_ACPI_CPPC_LIB +void init_freq_invariance_cppc(void); +#define init_freq_invariance_cppc init_freq_invariance_cppc +#endif + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 305bc1214aef..10b1de500ab1 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { X86_PF_PROT = 1 << 0, @@ -19,6 +20,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SGX = 1 << 15, }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 08b3d810dfba..1b6455f881f9 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -28,6 +28,20 @@ enum uv_bios_cmd { UV_BIOS_SET_LEGACY_VGA_TARGET }; +#define UV_BIOS_EXTRA 0x10000 +#define UV_BIOS_GET_PCI_TOPOLOGY 0x10001 +#define UV_BIOS_GET_GEOINFO 0x10003 + +#define UV_BIOS_EXTRA_OP_MEM_COPYIN 0x1000 +#define UV_BIOS_EXTRA_OP_MEM_COPYOUT 0x2000 +#define UV_BIOS_EXTRA_OP_MASK 0x0fff +#define UV_BIOS_EXTRA_GET_HEAPSIZE 1 +#define UV_BIOS_EXTRA_INSTALL_HEAP 2 +#define UV_BIOS_EXTRA_MASTER_NASID 3 +#define UV_BIOS_EXTRA_OBJECT_COUNT (10|UV_BIOS_EXTRA_OP_MEM_COPYOUT) +#define UV_BIOS_EXTRA_ENUM_OBJECTS (12|UV_BIOS_EXTRA_OP_MEM_COPYOUT) +#define UV_BIOS_EXTRA_ENUM_PORTS (13|UV_BIOS_EXTRA_OP_MEM_COPYOUT) + /* * Status values returned from a BIOS call. */ @@ -109,6 +123,32 @@ struct uv_systab { } entry[1]; /* additional entries follow */ }; extern struct uv_systab *uv_systab; + +#define UV_BIOS_MAXSTRING 128 +struct uv_bios_hub_info { + unsigned int id; + union { + struct { + unsigned long long this_part:1; + unsigned long long is_shared:1; + unsigned long long is_disabled:1; + } fields; + struct { + unsigned long long flags; + unsigned long long reserved; + } b; + } f; + char name[UV_BIOS_MAXSTRING]; + char location[UV_BIOS_MAXSTRING]; + unsigned int ports; +}; + +struct uv_bios_port_info { + unsigned int port; + unsigned int conn_id; + unsigned int conn_port; +}; + /* (... end of definitions from UV BIOS ...) 
*/ enum { @@ -142,6 +182,15 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); +extern s64 uv_bios_get_master_nasid(u64 sz, u64 *nasid); +extern s64 uv_bios_get_heapsize(u64 nasid, u64 sz, u64 *heap_sz); +extern s64 uv_bios_install_heap(u64 nasid, u64 sz, u64 *heap); +extern s64 uv_bios_obj_count(u64 nasid, u64 sz, u64 *objcnt); +extern s64 uv_bios_enum_objs(u64 nasid, u64 sz, u64 *objbuf); +extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 sz, u64 *portbuf); +extern s64 uv_bios_get_geoinfo(u64 nasid, u64 sz, u64 *geo); +extern s64 uv_bios_get_pci_topology(u64 sz, u64 *buf); + extern int uv_bios_init(void); extern unsigned long get_uv_systab_phys(bool msg); @@ -151,6 +200,8 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; +extern ssize_t uv_get_archtype(char *buf, int len); +extern int uv_get_hubless_system(void); extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h new file mode 100644 index 000000000000..f241451035fb --- /dev/null +++ b/arch/x86/include/asm/uv/uv_geo.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2020 Hewlett Packard Enterprise Development LP. All rights reserved. + */ + +#ifndef _ASM_UV_GEO_H +#define _ASM_UV_GEO_H + +/* Type declaractions */ + +/* Size of a geoid_s structure (must be before decl. of geoid_u) */ +#define GEOID_SIZE 8 + +/* Fields common to all substructures */ +struct geo_common_s { + unsigned char type; /* What type of h/w is named by this geoid_s */ + unsigned char blade; + unsigned char slot; /* slot is IRU */ + unsigned char upos; + unsigned char rack; +}; + +/* Additional fields for particular types of hardware */ +struct geo_node_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_rtr_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_iocntl_s { + struct geo_common_s common; /* No additional fields needed */ +}; + +struct geo_pcicard_s { + struct geo_iocntl_s common; + char bus; /* Bus/widget number */ + char slot; /* PCI slot number */ +}; + +/* Subcomponents of a node */ +struct geo_cpu_s { + struct geo_node_s node; + unsigned char socket:4, /* Which CPU on the node */ + thread:4; + unsigned char core; +}; + +struct geo_mem_s { + struct geo_node_s node; + char membus; /* The memory bus on the node */ + char memslot; /* The memory slot on the bus */ +}; + +union geoid_u { + struct geo_common_s common; + struct geo_node_s node; + struct geo_iocntl_s iocntl; + struct geo_pcicard_s pcicard; + struct geo_rtr_s rtr; + struct geo_cpu_s cpu; + struct geo_mem_s mem; + char padsize[GEOID_SIZE]; +}; + +/* Defined constants */ + +#define GEO_MAX_LEN 48 + +#define GEO_TYPE_INVALID 0 +#define GEO_TYPE_MODULE 1 +#define GEO_TYPE_NODE 2 +#define GEO_TYPE_RTR 3 +#define GEO_TYPE_IOCNTL 4 +#define GEO_TYPE_IOCARD 5 +#define GEO_TYPE_CPU 6 +#define GEO_TYPE_MEM 7 +#define GEO_TYPE_MAX (GEO_TYPE_MEM+1) + +static inline int geo_rack(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? 
+ -1 : g.common.rack; +} + +static inline int geo_slot(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? + -1 : g.common.upos; +} + +static inline int geo_blade(union geoid_u g) +{ + return (g.common.type == GEO_TYPE_INVALID) ? + -1 : g.common.blade * 2 + g.common.slot; +} + +#endif /* _ASM_UV_GEO_H */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..98aa103eb4ab 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -15,6 +15,8 @@ struct vdso_image { unsigned long size; /* Always a multiple of PAGE_SIZE */ unsigned long alt, alt_len; + unsigned long extable_base, extable_len; + const void *extable; long sym_vvar_start; /* Negative offset to the vvar area */ @@ -27,6 +29,8 @@ struct vdso_image { long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; long sym_int80_landing_pad; + long sym_vdso32_sigreturn_landing_pad; + long sym_vdso32_rt_sigreturn_landing_pad; }; #ifdef CONFIG_X86_64 @@ -45,6 +49,9 @@ extern void __init init_vdso_image(const struct vdso_image *image); extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); +extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f8ba5289ecb0..38ca445a8429 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -113,6 +113,7 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 #define VMX_MISC_ZERO_LEN_INS 0x40000000 #define VMX_MISC_MSR_LIST_MULTIPLIER 512 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index dde5b3f1e7cd..5c69f7eb5d47 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -116,6 +116,7 @@ struct x86_init_pci { * @init_platform: platform setup * @guest_late_init: guest late init * @x2apic_available: X2APIC detection + * @msi_ext_dest_id: MSI supports 15-bit APIC IDs * @init_mem_mapping: setup early mappings during init_mem_mapping() * @init_after_bootmem: guest init after boot allocator is finished */ @@ -123,6 +124,7 @@ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); + bool (*msi_ext_dest_id)(void); void (*init_mem_mapping)(void); void (*init_after_bootmem)(void); }; diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 5941e18edd5a..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -355,7 +355,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); -#define xen_remap(cookie, size) ioremap((cookie), (size)); +#define xen_remap(cookie, size) ioremap((cookie), (size)) #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 89e5f3d1bba8..8e76d3701db3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -12,6 +12,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/include/uapi/asm/sgx.h 
b/arch/x86/include/uapi/asm/sgx.h new file mode 100644 index 000000000000..9034f3007c4e --- /dev/null +++ b/arch/x86/include/uapi/asm/sgx.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ +#ifndef _UAPI_ASM_X86_SGX_H +#define _UAPI_ASM_X86_SGX_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/** + * enum sgx_page_flags - page control flags + * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of + * ENCLS[EEXTEND] operations. + */ +enum sgx_page_flags { + SGX_PAGE_MEASURE = 0x01, +}; + +#define SGX_MAGIC 0xA4 + +#define SGX_IOC_ENCLAVE_CREATE \ + _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create) +#define SGX_IOC_ENCLAVE_ADD_PAGES \ + _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages) +#define SGX_IOC_ENCLAVE_INIT \ + _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) +#define SGX_IOC_ENCLAVE_PROVISION \ + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) + +/** + * struct sgx_enclave_create - parameter structure for the + * %SGX_IOC_ENCLAVE_CREATE ioctl + * @src: address for the SECS page data + */ +struct sgx_enclave_create { + __u64 src; +}; + +/** + * struct sgx_enclave_add_pages - parameter structure for the + * %SGX_IOC_ENCLAVE_ADD_PAGE ioctl + * @src: start address for the page data + * @offset: starting page offset + * @length: length of the data (multiple of the page size) + * @secinfo: address for the SECINFO data + * @flags: page control flags + * @count: number of bytes added (multiple of the page size) + */ +struct sgx_enclave_add_pages { + __u64 src; + __u64 offset; + __u64 length; + __u64 secinfo; + __u64 flags; + __u64 count; +}; + +/** + * struct sgx_enclave_init - parameter structure for the + * %SGX_IOC_ENCLAVE_INIT ioctl + * @sigstruct: address for the SIGSTRUCT data + */ +struct sgx_enclave_init { + __u64 sigstruct; +}; + +/** + * struct sgx_enclave_provision - parameter structure for the + * %SGX_IOC_ENCLAVE_PROVISION ioctl + * @fd: file handle of /dev/sgx_provision + */ +struct sgx_enclave_provision { + __u64 fd; +}; + +struct sgx_enclave_run; + +/** + * typedef sgx_enclave_user_handler_t - Exit handler function accepted by + * __vdso_sgx_enter_enclave() + * @run: The run instance given by the caller + * + * The register parameters contain the snapshot of their values at enclave + * exit. An invalid ENCLU function number will cause -EINVAL to be returned + * to the caller. + * + * Return: + * - <= 0: The given value is returned back to the caller. + * - > 0: ENCLU function to invoke, either EENTER or ERESUME. + */ +typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx, + long rsp, long r8, long r9, + struct sgx_enclave_run *run); + +/** + * struct sgx_enclave_run - the execution context of __vdso_sgx_enter_enclave() + * @tcs: TCS used to enter the enclave + * @function: The last seen ENCLU function (EENTER, ERESUME or EEXIT) + * @exception_vector: The interrupt vector of the exception + * @exception_error_code: The exception error code pulled out of the stack + * @exception_addr: The address that triggered the exception + * @user_handler: User provided callback run on exception + * @user_data: Data passed to the user handler + * @reserved Reserved for future extensions + * + * If @user_handler is provided, the handler will be invoked on all return paths + * of the normal flow. The user handler may transfer control, e.g. via a + * longjmp() call or a C++ exception, without returning to + * __vdso_sgx_enter_enclave(). 
+ */ +struct sgx_enclave_run { + __u64 tcs; + __u32 function; + __u16 exception_vector; + __u16 exception_error_code; + __u64 exception_addr; + __u64 user_handler; + __u64 user_data; + __u8 reserved[216]; +}; + +/** + * typedef vdso_sgx_enter_enclave_t - Prototype for __vdso_sgx_enter_enclave(), + * a vDSO function to enter an SGX enclave. + * @rdi: Pass-through value for RDI + * @rsi: Pass-through value for RSI + * @rdx: Pass-through value for RDX + * @function: ENCLU function, must be EENTER or ERESUME + * @r8: Pass-through value for R8 + * @r9: Pass-through value for R9 + * @run: struct sgx_enclave_run, must be non-NULL + * + * NOTE: __vdso_sgx_enter_enclave() does not ensure full compliance with the + * x86-64 ABI, e.g. doesn't handle XSAVE state. Except for non-volatile + * general purpose registers, EFLAGS.DF, and RSP alignment, preserving/setting + * state in accordance with the x86-64 ABI is the responsibility of the enclave + * and its runtime, i.e. __vdso_sgx_enter_enclave() cannot be called from C + * code without careful consideration by both the enclave and its runtime. + * + * All general purpose registers except RAX, RBX and RCX are passed as-is to the + * enclave. RAX, RBX and RCX are consumed by EENTER and ERESUME and are loaded + * with @function, asynchronous exit pointer, and @run.tcs respectively. + * + * RBP and the stack are used to anchor __vdso_sgx_enter_enclave() to the + * pre-enclave state, e.g. to retrieve @run.exception and @run.user_handler + * after an enclave exit. All other registers are available for use by the + * enclave and its runtime, e.g. an enclave can push additional data onto the + * stack (and modify RSP) to pass information to the optional user handler (see + * below). + * + * Most exceptions reported on ENCLU, including those that occur within the + * enclave, are fixed up and reported synchronously instead of being delivered + * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are + * never fixed up and are always delivered via standard signals. On synchrously + * reported exceptions, -EFAULT is returned and details about the exception are + * recorded in @run.exception, the optional sgx_enclave_exception struct. + * + * Return: + * - 0: ENCLU function was successfully executed. + * - -EINVAL: Invalid ENCL number (neither EENTER nor ERESUME). + */ +typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi, + unsigned long rdx, unsigned int function, + unsigned long r8, unsigned long r9, + struct sgx_enclave_run *run); + +#endif /* _UAPI_ASM_X86_SGX_H */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index e5745d593dc7..164a22a72984 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -62,30 +62,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. 
- */ -#define SA_NOCLDSTOP 0x00000001u -#define SA_NOCLDWAIT 0x00000002u -#define SA_SIGINFO 0x00000004u -#define SA_ONSTACK 0x08000000u -#define SA_RESTART 0x10000000u -#define SA_NODEFER 0x40000000u -#define SA_RESETHAND 0x80000000u - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index f1d8307454e0..554f75fe013c 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -77,10 +77,28 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_EFER_WRITE_TRAP 0x08f +#define SVM_EXIT_CR0_WRITE_TRAP 0x090 +#define SVM_EXIT_CR1_WRITE_TRAP 0x091 +#define SVM_EXIT_CR2_WRITE_TRAP 0x092 +#define SVM_EXIT_CR3_WRITE_TRAP 0x093 +#define SVM_EXIT_CR4_WRITE_TRAP 0x094 +#define SVM_EXIT_CR5_WRITE_TRAP 0x095 +#define SVM_EXIT_CR6_WRITE_TRAP 0x096 +#define SVM_EXIT_CR7_WRITE_TRAP 0x097 +#define SVM_EXIT_CR8_WRITE_TRAP 0x098 +#define SVM_EXIT_CR9_WRITE_TRAP 0x099 +#define SVM_EXIT_CR10_WRITE_TRAP 0x09a +#define SVM_EXIT_CR11_WRITE_TRAP 0x09b +#define SVM_EXIT_CR12_WRITE_TRAP 0x09c +#define SVM_EXIT_CR13_WRITE_TRAP 0x09d +#define SVM_EXIT_CR14_WRITE_TRAP 0x09e +#define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 +#define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ #define SVM_VMGEXIT_MMIO_READ 0x80000001 @@ -183,10 +201,20 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \ + { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \ + { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ + { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_VMGEXIT, "vmgexit" }, \ + { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \ + { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \ + { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ + { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ + { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d5..ada955c5ebb6 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -32,6 +32,7 @@ #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 +#define EXIT_REASON_SIPI_SIGNAL 4 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -94,6 +95,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ + { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 68608bd892c0..5eeb808eb024 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -161,5 +161,3 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += 
vsmp_64.o endif - -obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c22fb55abcfd..0916f00a992e 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + return apei_smca_report_x86_error(ctx_info, lapic_id); +} diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dc..5d3a0b8fd379 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2400ad62f330..8d778e46725d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1374,7 +1374,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 18f6b7c4bd79..b4396952c9a6 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -384,7 +384,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -398,7 +398,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); unsigned int reg; int cuid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b3eef1d5c903..6bd20c0de8bc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -94,6 +94,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; /* + * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID + */ +static bool virt_ext_dest_id __ro_after_init; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -1591,7 +1596,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* @@ -1841,20 +1846,34 @@ static __init void try_to_enable_x2apic(int remap_mode) return; if (remap_mode != IRQ_REMAP_X2APIC_MODE) { - /* IR is required if there is APIC ID > 255 even when running - * under KVM + u32 apic_limit = 255; + + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be 
supported in guests. */ - if (max_physical_apicid > 255 || - !x86_init.hyper.x2apic_available()) { + if (!x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; } /* - * without IR all CPUs can be addressed by IOAPIC/MSI - * only in physical mode + * If the hypervisor supports extended destination ID in + * MSI, that increases the maximum APIC ID that can be + * used for non-remapped IRQ domains. */ + if (x86_init.hyper.msi_ext_dest_id()) { + virt_ext_dest_id = 1; + apic_limit = 32767; + } + + /* + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannot + * be addressed must not be brought online. + */ + x2apic_set_max_apicid(apic_limit); x2apic_phys = 1; } x2apic_enable(); @@ -2478,6 +2497,46 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; + + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address APICs which can't be addressed in the normal 32-bit + * address range at 0xFFExxxxx. That is typically just 8 bits, but + * some hypervisors allow the extended destination ID field in bits + * 5-11 to be used, giving support for 15 bits of APIC IDs in total. + */ + if (dmar) + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000) + msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8; + else + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); +} + +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + /* * Override the generic EOI implementation with an optimized version.
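A small worked example of the destination ID layout described above, mirroring __irq_msi_compose_msg() and x86_msi_msg_get_destid(); the helper name is made up and the snippet is illustrative only.

/* Illustrative only: how a 15-bit APIC ID is split across the MSI address
 * fields when virt_ext_dest_id is set, and how it is put back together. */
static unsigned int example_destid_roundtrip(unsigned int apicid)
{
	unsigned int destid_0_7       = apicid & 0xFF; /* classic 8-bit MSI field      */
	unsigned int virt_destid_8_14 = apicid >> 8;   /* bits 5-11 of the low address */

	/* e.g. APIC ID 0x1234: destid_0_7 = 0x34, virt_destid_8_14 = 0x12;
	 * IDs must stay below 0x8000, hence x2apic_set_max_apicid(32767). */
	return destid_0_7 | (virt_destid_8_14 << 8);   /* == apicid again */
}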
* Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052..8f72b4351c9f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 780c702969b7..fe78319e0f7a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -95,19 +95,15 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 35edd57f064a..a54d817eb4b6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -295,15 +293,13 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - 
.irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 98d015a4405a..77555f66c14d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -127,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_used = bigsmp_check_apicid_used, .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7b3c7e0d4a09..e4ab4804b20d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,6 +48,7 @@ #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/msi.h> #include <asm/irqdomain.h> #include <asm/io.h> @@ -63,7 +64,6 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hw_irq.h> - #include <asm/apic.h> #define for_each_ioapic(idx) \ @@ -89,12 +89,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -286,31 +286,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -321,11 +316,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ 
-344,12 +336,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -422,20 +414,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -459,13 +446,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -506,8 +493,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -535,15 +522,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -556,8 +543,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. 
*/ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -659,8 +646,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -745,44 +732,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -791,90 +741,139 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. + */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); fallthrough; case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. 
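A worked example of the ELCR addressing used by EISA_ELCR() above; the helper is hypothetical and only restates the arithmetic.

/* Illustrative only: the ELCR occupies ports 0x4d0 (IRQ 0-7) and 0x4d1 (IRQ 8-15),
 * one bit per IRQ. For IRQ 11: port = 0x4d0 + (11 >> 3) = 0x4d1, bit = 11 & 7 = 3. */
static bool example_elcr_is_level(unsigned int irq)
{
	unsigned int port = 0x4d0 + (irq >> 3);
	unsigned int bit  = irq & 7;

	return (inb(port) >> bit) & 1;	/* set = level triggered, clear = edge */
}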
+ */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline int eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to level, ISA to edge. + */ + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); fallthrough; case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; info->ioapic.node = node; - info->ioapic.trigger = trigger; - info->ioapic.polarity = polarity; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; @@ -883,20 +882,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic.valid = 1; if (src && src->ioapic.valid) { dst->ioapic.node = src->ioapic.node; - dst->ioapic.trigger = src->ioapic.trigger; - dst->ioapic.polarity = src->ioapic.polarity; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { dst->ioapic.node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic.trigger = trigger; - dst->ioapic.polarity = polarity; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + 
dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic.trigger = IOAPIC_LEVEL; - dst->ioapic.polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } @@ -906,12 +905,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info) return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -933,14 +932,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic.trigger != data->trigger) - mp_register_handler(irq, info->ioapic.trigger); - data->entry.trigger = data->trigger = info->ioapic.trigger; - data->entry.polarity = data->polarity = info->ioapic.polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic.trigger && - data->polarity == info->ioapic.polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1219,10 +1218,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1230,20 +1228,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, + entry.delivery_mode); + } } } @@ -1368,7 +1367,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
*/ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1410,14 +1410,16 @@ void native_restore_boot_irq_mode(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; + u32 apic_id = read_apic_id(); memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; - entry.dest = read_apic_id(); + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry.destid_0_7 = apic_id & 0xFF; + entry.virt_destid_8_14 = apic_id >> 8; /* * Add it to the IO-APIC irq-routing table: @@ -1618,21 +1620,16 @@ static void __init delay_without_tsc(void) static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - unsigned long flags; if (no_timer_check) return 1; - local_save_flags(flags); local_irq_enable(); - if (boot_cpu_has(X86_FEATURE_TSC)) delay_with_tsc(); else delay_without_tsc(); - local_irq_restore(flags); - /* * Expect a few ticks at least, to be sure some possible * glue logic does not lock up after one or two first @@ -1641,10 +1638,10 @@ static int __init timer_irq_works(void) * least one tick may be lost due to delays. */ - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; + local_irq_disable(); + + /* Did jiffies advance? */ + return time_after(jiffies, t1 + 4); } /* @@ -1696,13 +1693,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1849,21 +1846,62 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. 
+ */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * - DMAR/IR: index bit 0-14. + * + * - Virt: If the host supports x2apic without a virtualized IR + * unit then bit 0-6 of dmar_index_0_14 are providing bit + * 8-14 of the destination id. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. - */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -1919,7 +1957,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -2027,6 +2065,7 @@ static inline void __init unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); if (pin == -1) { @@ -2042,14 +2081,16 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); + apic_id = hard_smp_processor_id(); memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = apic_id & 0xFF; + entry1.virt_destid_8_14 = apic_id >> 8; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2117,13 +2158,12 @@ static inline void __init check_timer(void) struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; - unsigned long flags; int no_pin1 = 0; if (!global_clock_event) return; - local_irq_save(flags); + local_irq_disable(); /* * get/set the timer IRQ vector: @@ -2178,9 +2218,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? 
*/ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2191,7 +2231,6 @@ static inline void __init check_timer(void) goto out; } panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); - local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2215,7 +2254,6 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ - local_irq_disable(); legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); @@ -2232,7 +2270,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -2251,7 +2288,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); if (apic_is_x2apic_enabled()) apic_printk(APIC_QUIET, KERN_INFO @@ -2260,7 +2296,7 @@ static inline void __init check_timer(void) panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: - local_irq_restore(flags); + local_irq_enable(); } /* @@ -2284,36 +2320,37 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; - info.devid = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return -ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = mpc_ioapic_id(ioapic); + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); @@ -2587,30 +2624,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. 
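The fwspec built in mp_irqdomain_create() above replaces the old irq_remapping_get_irq_domain() lookup: the I/OAPIC now describes itself and lets a parent domain claim it through its .select() callback. A hedged sketch of the shape such a callback takes (the real implementations are x86_vector_select() later in this diff and the equivalent select() callbacks in the IRQ remapping drivers):

/* Illustrative only: a parent domain claims the "IO-APIC-<id>" fwspec. */
static int example_select(struct irq_domain *d, struct irq_fwspec *fwspec,
			  enum irq_domain_bus_token bus_token)
{
	/* fwspec->fwnode names the I/OAPIC, fwspec->param[0] is mpc_ioapic_id(). */
	return x86_fwspec_is_ioapic(fwspec);
}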
@@ -2934,44 +2947,49 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { if (info && info->ioapic.valid) { - data->trigger = info->ioapic.trigger; - data->polarity = info->ioapic.polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. 
*/ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -2989,7 +3007,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -3003,22 +3020,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + mp_preconfigure_entry(data); + mp_register_handler(virq, data->is_level); + local_irq_save(flags); - if (info->ioapic.entry) - mp_setup_entry(cfg, data, info->ioapic.entry); - mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 387154e39e08..d1fb874fbe64 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); } @@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6313f0a05db7..44ebe25e7703 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,7 +15,6 @@ #include <linux/hpet.h> #include <linux/msi.h> #include <asm/irqdomain.h> -#include <asm/msidef.h> #include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> @@ -23,38 +22,11 @@ struct irq_domain *x86_pci_msi_default_domain __ro_after_init; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - 
((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); -} - -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg); -} - static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -276,6 +248,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -288,6 +271,7 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -356,114 +340,3 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif - -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) -{ - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, - handle_edge_irq, arg->data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - -static struct msi_domain_ops hpet_msi_domain_ops = { - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = &hpet_msi_domain_ops, - .chip = &hpet_msi_controller, - .flags = MSI_FLAG_USE_DEF_DOM_OPS, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - 
*domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - if (!d) { - irq_domain_free_fwnode(fn); - kfree(domain_info); - } - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.data = hc; - info.devid = hpet_dev_id(domain); - info.hwirq = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b6f7c049ec..a61f642b1b90 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -69,16 +69,13 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 758bbf25ef74..3c9c7492252f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -640,7 +640,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. 
+ */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, @@ -822,6 +865,12 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b0889c48a2ac..df6adc5674c9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -61,7 +61,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } @@ -184,15 +184,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bc9693841353..0e4e81971567 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,6 +8,12 @@ int x2apic_phys; static struct apic apic_x2apic_phys; +static u32 x2apic_max_apicid __ro_after_init; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; +} static int __init set_x2apic_phys_mode(char *arg) { @@ -98,6 +104,9 @@ static int x2apic_phys_probe(void) /* Common x2apic functions, also used by x2apic_cluster */ int x2apic_apic_id_valid(u32 apicid) { + if (x2apic_max_apicid && apicid > x2apic_max_apicid) + return 0; + return 1; } @@ -148,15 +157,13 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 235f5cde06fc..52bc217ca8c3 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -502,6 +502,18 @@ enum uv_system_type get_uv_system_type(void) return uv_system_type; } +int uv_get_hubless_system(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(uv_get_hubless_system); + +ssize_t uv_get_archtype(char *buf, int len) +{ + return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id); +} 
+EXPORT_SYMBOL_GPL(uv_get_archtype); + int is_uv_system(void) { return uv_system_type != UV_NONE; @@ -716,9 +728,9 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long dmode, val; if (vector == NMI_VECTOR) - dmode = dest_NMI; + dmode = APIC_DELIVERY_MODE_NMI; else - dmode = dest_Fixed; + dmode = APIC_DELIVERY_MODE_FIXED; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -820,15 +832,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* Physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -1603,21 +1613,30 @@ static void check_efi_reboot(void) reboot_type = BOOT_ACPI; } -/* Setup user proc fs files */ +/* + * User proc fs file handling now deprecated. + * Recommend using /sys/firmware/sgi_uv/... instead. + */ static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubbed_system); return 0; } static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubless_system); return 0; } static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n", + current->comm); seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); return 0; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 70b7154f4bdd..60b9f42ce3c1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -66,7 +66,6 @@ static void __used common(void) OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); - OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); #endif #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93792b457b81..637b499450d1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ +obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6062ce586b95..f8ca66f3d861 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -23,7 +23,6 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/set_memory.h> #endif #include "cpu.h" @@ -330,7 +329,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -340,7 +338,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, 
&ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -360,15 +358,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu, node_id); + cacheinfo_amd_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -393,7 +391,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } static void amd_detect_ppin(struct cpuinfo_x86 *c) @@ -425,12 +423,6 @@ clear_ppin: clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); } -u16 amd_get_nb_id(int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(amd_get_nb_id); - u32 amd_get_nodes_per_socket(void) { return nodes_per_socket; @@ -516,26 +508,6 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c) static void bsp_init_amd(struct cpuinfo_x86 *c) { - -#ifdef CONFIG_X86_64 - if (c->x86 >= 0xf) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { if (c->x86 > 0x10 || diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e2f319dc992d..22911deacb6e 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,11 +14,13 @@ #include <linux/cpufreq.h> #include <linux/smp.h> #include <linux/sched/isolation.h> +#include <linux/rcupdate.h> #include "cpu.h" struct aperfmperf_sample { unsigned int khz; + atomic_t scfpending; ktime_t time; u64 aperf; u64 mperf; @@ -62,17 +64,20 @@ static void aperfmperf_snapshot_khz(void *dummy) s->aperf = aperf; s->mperf = mperf; s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + atomic_set_release(&s->scfpending, 0); } static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) { s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); /* Don't bother re-computing within the cache threshold time. */ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return true; - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + if (!atomic_xchg(&s->scfpending, 1) || wait) + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); /* Return false if the previous iteration was too long ago. */ return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; @@ -89,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) return 0; + if (rcu_is_idle_cpu(cpu)) + return 0; /* Idle CPUs are completely uninteresting. 
*/ + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -108,6 +116,8 @@ void arch_freq_prepare_all(void) for_each_online_cpu(cpu) { if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) continue; + if (rcu_is_idle_cpu(cpu)) + continue; /* Idle CPUs are completely uninteresting. */ if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; } @@ -118,6 +128,8 @@ void arch_freq_prepare_all(void) unsigned int arch_freq_get_on_cpu(int cpu) { + struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + if (!cpu_khz) return 0; @@ -131,6 +143,8 @@ unsigned int arch_freq_get_on_cpu(int cpu) return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); + atomic_set(&s->scfpending, 1); + smp_mb(); /* ->scfpending before smp_call_function_single(). */ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); return per_cpu(samples.khz, cpu); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 57074cf3ad7c..3ca9be482a9e 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -580,7 +580,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = amd_get_nb_id(smp_processor_id()); + node = topology_die_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -646,7 +646,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -657,7 +657,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. 
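The scfpending flag introduced in the aperfmperf.c hunks above exists to stop redundant cross-CPU snapshot requests: an asynchronous smp_call_function_single() is only issued by the caller that wins the 0 to 1 transition, and the handler clears the flag with release semantics after publishing the new sample. A minimal userspace sketch of that claim-then-complete pattern using C11 atomics; the struct, the fake handler and the constants below are illustrative only and are not taken from the patch:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct sample {
        atomic_int scfpending;   /* non-zero while a snapshot request is in flight */
        unsigned int khz;
    };

    /* What the cross-CPU handler would do: write the result, then clear the flag. */
    static void snapshot_handler(struct sample *s)
    {
        s->khz = 2400000;        /* pretend measurement */
        atomic_store_explicit(&s->scfpending, 0, memory_order_release);
    }

    /* Issue a request only if no earlier one is still pending (the 0 -> 1 winner). */
    static bool try_request(struct sample *s)
    {
        return atomic_exchange(&s->scfpending, 1) == 0;
    }

    int main(void)
    {
        struct sample s = { .scfpending = 0 };

        printf("first issued:  %d\n", try_request(&s));   /* 1: wins the flag    */
        printf("second issued: %d\n", try_request(&s));   /* 0: already pending  */

        snapshot_handler(&s);                              /* handler completes   */
        printf("third issued:  %d\n", try_request(&s));   /* 1: flag was cleared */
        return 0;
    }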
@@ -684,7 +684,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d502241995a3..42af31b64c2c 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 29a3bedabd06..3b1b01f2b248 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,16 +93,41 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ +static void clear_sgx_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_SGX); + setup_clear_cpu_cap(X86_FEATURE_SGX_LC); +} + +static int __init nosgx(char *str) +{ + clear_sgx_caps(); + + return 0; +} + +early_param("nosgx", nosgx); + void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { bool tboot = tboot_enabled(); + bool enable_sgx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); + clear_sgx_caps(); return; } + /* + * Enable SGX if and only if the kernel supports SGX and Launch Control + * is supported, i.e. disable SGX if the LE hash MSRs can't be written. 
+ */ + enable_sgx = cpu_has(c, X86_FEATURE_SGX) && + cpu_has(c, X86_FEATURE_SGX_LC) && + IS_ENABLED(CONFIG_X86_SGX); + if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -124,13 +149,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } + if (enable_sgx) + msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + wrmsrl(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); if (!cpu_has(c, X86_FEATURE_VMX)) - return; + goto update_sgx; if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { @@ -143,4 +171,12 @@ update_caps: init_vmx_capabilities(c); #endif } + +update_sgx: + if (!(msr & FEAT_CTL_SGX_ENABLED) || + !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { + if (enable_sgx) + pr_err_once("SGX disabled by BIOS\n"); + clear_sgx_caps(); + } } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ac6c30e5801d..ae59115d18f9 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -14,9 +14,6 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> -#ifdef CONFIG_X86_64 -# include <asm/set_memory.h> -#endif #include "cpu.h" @@ -65,7 +62,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -75,7 +71,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; c->cpu_core_id = ebx & 0xff; @@ -93,14 +89,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) /* Socket ID is ApicId[6] for these processors. */ c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu, node_id); + cacheinfo_hygon_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -123,7 +119,7 @@ static void hygon_detect_cmp(struct cpuinfo_x86 *c) /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -204,23 +200,6 @@ static void early_init_hygon_mc(struct cpuinfo_x86 *c) static void bsp_init_hygon(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. 
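Taken together, the feat_ctl.c hunks above (plus the new 'nosgx' parameter) reduce SGX enablement to a small decision: the kernel asks for SGX only when the CPU reports both SGX and SGX_LC and CONFIG_X86_SGX is set, and that request can still be vetoed by a firmware-locked MSR_IA32_FEAT_CTL that lacks the two SGX enable bits, which is the "SGX disabled by BIOS" case. A standalone restatement of that decision as a pure function; the helper name is invented, and the bit positions (lock = 0, SGX launch control = 17, SGX = 18) are the architectural ones, included only so the sketch compiles:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FC_LOCKED      (1u << 0)
    #define FC_SGX_LC_EN   (1u << 17)
    #define FC_SGX_EN      (1u << 18)

    static bool sgx_will_be_usable(bool cpu_sgx, bool cpu_sgx_lc, bool kconfig_sgx,
                                   uint32_t feat_ctl)
    {
        bool want = cpu_sgx && cpu_sgx_lc && kconfig_sgx;

        if (!want)
            return false;              /* kernel never asks for it            */
        if (!(feat_ctl & FC_LOCKED))
            return true;               /* kernel can set and lock the bits    */
        /* Locked by firmware: usable only if BIOS already enabled both bits. */
        return (feat_ctl & FC_SGX_EN) && (feat_ctl & FC_SGX_LC_EN);
    }

    int main(void)
    {
        printf("%d\n", sgx_will_be_usable(true, true, true, 0));          /* 1 */
        printf("%d\n", sgx_will_be_usable(true, true, true, FC_LOCKED));  /* 0: disabled by BIOS */
        printf("%d\n", sgx_will_be_usable(true, false, true, 0));         /* 0: no Launch Control */
        return 0;
    }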
- */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0c6b02dd744c..e486f96b3cb3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1341,7 +1341,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb = node_to_amd_nb(topology_die_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1445,7 +1445,7 @@ static void threshold_remove_bank(struct threshold_bank *bank) * The last CPU on this node using the shared bank is going * away, remove that bank now. */ - nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id())); + nb = node_to_amd_nb(topology_die_id(smp_processor_id())); nb->bank4 = NULL; } diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index af8d37962586..b58b85380ddb 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu; + struct mce m; + + if (!boot_cpu_has(X86_FEATURE_SMCA)) + return -EINVAL; + + /* + * The starting address of the register array extracted from BERT must + * match with the first expected register in the register layout of + * SMCA address space. This address corresponds to banks's MCA_STATUS + * register. + * + * Match any MCi_STATUS register by turning off bank numbers. + */ + if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) != + MSR_AMD64_SMCA_MC0_STATUS) + return -EINVAL; + + /* + * The register array size must be large enough to include all the + * SMCA registers which need to be extracted. + * + * The number of registers in the register array is determined by + * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. + * The register layout is fixed and currently the raw data in the + * register array includes 6 SMCA registers which the kernel can + * extract. 
+ */ + if (ctx_info->reg_arr_size < 48) + return -EINVAL; + + mce_setup(&m); + + m.extcpu = -1; + m.socketid = -1; + + for_each_possible_cpu(cpu) { + if (cpu_data(cpu).initial_apicid == lapic_id) { + m.extcpu = cpu; + m.socketid = cpu_data(m.extcpu).phys_proc_id; + break; + } + } + + m.apicid = lapic_id; + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; + m.status = *i_mce; + m.addr = *(i_mce + 1); + m.misc = *(i_mce + 2); + /* Skipping MCA_CONFIG */ + m.ipid = *(i_mce + 4); + m.synd = *(i_mce + 5); + + mce_log(&m); + + return 0; +} + #define CPER_CREATOR_MCE \ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ 0x64, 0x90, 0xb8, 0x9d) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 32b7099e3511..13d3f1cbda17 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log); void mce_register_decode_chain(struct notifier_block *nb) { - if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) return; blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); @@ -1265,14 +1266,14 @@ static void kill_me_maybe(struct callback_head *cb) } } -static void queue_task_work(struct mce *m, int kill_it) +static void queue_task_work(struct mce *m, int kill_current_task) { current->mce_addr = m->addr; current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - if (kill_it) + if (kill_current_task) current->mce_kill_me.func = kill_me_now; else current->mce_kill_me.func = kill_me_maybe; @@ -1320,10 +1321,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int no_way_out = 0; /* - * If kill_it gets set, there might be a way to recover from this + * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_it = 0; + int kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES @@ -1350,8 +1351,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - + kill_current_task = (cfg->tolerant == 3) ? 0 : 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. @@ -1368,7 +1368,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * to see it will clear it. */ if (lmce) { - if (no_way_out) + if (no_way_out && cfg->tolerant < 3) mce_panic("Fatal local machine check", &m, msg); } else { order = mce_start(&no_way_out); @@ -1387,6 +1387,9 @@ noinstr void do_machine_check(struct pt_regs *regs) if (mce_end(order) < 0) { if (!no_way_out) no_way_out = worst >= MCE_PANIC_SEVERITY; + + if (no_way_out && cfg->tolerant < 3) + mce_panic("Fatal machine check on current CPU", &m, msg); } } else { /* @@ -1403,19 +1406,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } } - /* - * If tolerant is at an insane level we drop requests to kill - * processes and continue even when there is no way out. 
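The apei_smca_report_x86_error() helper above treats the CPER context payload as a flat array of eight-byte MSR images starting at the bank's MCA_STATUS register, which is why it insists on at least 48 bytes (six registers) and derives the bank number from the STATUS MSR address. A hedged sketch of that interpretation; the struct and function names are invented, and only the register ordering, the skipped MCA_CONFIG slot and the (msr_addr >> 4) & 0xff bank calculation come from the hunk above:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Order of the eight-byte register images as consumed above. */
    enum { R_STATUS, R_ADDR, R_MISC, R_CONFIG /* skipped */, R_IPID, R_SYND, R_COUNT };

    struct fake_mce {                  /* tiny stand-in for struct mce */
        uint64_t status, addr, misc, ipid, synd;
        unsigned int bank;
    };

    static void parse_smca_ctx(uint64_t msr_addr, const uint64_t *regs, size_t nregs,
                               struct fake_mce *m)
    {
        if (nregs < R_COUNT)
            return;                    /* mirrors the reg_arr_size < 48 check */

        /* SMCA banks use 16 MSRs each, so the bank number falls out of the
         * MCA_STATUS address: (msr_addr >> 4) & 0xff. */
        m->bank   = (msr_addr >> 4) & 0xff;
        m->status = regs[R_STATUS];
        m->addr   = regs[R_ADDR];
        m->misc   = regs[R_MISC];
        m->ipid   = regs[R_IPID];      /* MCA_CONFIG at index 3 is not copied */
        m->synd   = regs[R_SYND];
    }

    int main(void)
    {
        uint64_t regs[R_COUNT] = { 0 };            /* dummy payload */
        struct fake_mce m = { 0 };

        parse_smca_ctx(0xc0002001, regs, R_COUNT, &m);  /* bank-0 STATUS-style address */
        printf("bank %u\n", m.bank);                    /* prints 0 */
        return 0;
    }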
- */ - if (cfg->tolerant == 3) - kill_it = 0; - else if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - - if (worst > 0) - irq_work_queue(&mce_irq_work); - - if (worst != MCE_AR_SEVERITY && !kill_it) + if (worst != MCE_AR_SEVERITY && !kill_current_task) goto out; /* Fault was in user mode and we need to take some action */ @@ -1423,7 +1414,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, kill_it); + queue_task_work(&m, kill_current_task); } else { /* @@ -1441,7 +1432,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, kill_it); + queue_task_work(&m, kill_current_task); } out: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); @@ -1583,7 +1574,7 @@ static void __mcheck_cpu_mce_banks_init(void) * __mcheck_cpu_init_clear_banks() does the final bank setup. */ b->ctl = -1ULL; - b->init = 1; + b->init = true; } } @@ -1764,7 +1755,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = 0; + mce_banks[0].init = false; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1813,11 +1804,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); return 1; - break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); return 1; - break; default: return 0; } @@ -1985,7 +1974,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1997,7 +1986,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. @@ -2008,7 +1997,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3a44346f2276..7b360731fc2d 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -522,8 +522,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + toggle_nb_mca_mst_cpu(topology_die_id(cpu)); + cpu = get_nbc_for_node(topology_die_id(cpu)); } get_online_cpus(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index abe9fe0fb851..c2476fe0682e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -509,12 +509,33 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) } } +/* + * Enable additional error logs from the integrated + * memory controller on processors that support this. 
+ */ +static void intel_imc_init(struct cpuinfo_x86 *c) +{ + u64 error_control; + + switch (c->x86_model) { + case INTEL_FAM6_SANDYBRIDGE_X: + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + return; + error_control |= 2; + wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + break; + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); + intel_imc_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 3f6b137ef4e6..3d4a48336084 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -215,7 +215,6 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); return 0; - break; } if (sh_psize > min_t(u32, buf_size, max_size)) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 05ef1f4550cb..f628e3dc150f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -366,9 +366,38 @@ static void __init ms_hyperv_init_platform(void) #endif } +static bool __init ms_hyperv_x2apic_available(void) +{ + return x2apic_supported(); +} + +/* + * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() + * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the + * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). + * + * Note: for a VM on Hyper-V, the I/O-APIC is the only device which + * (logically) generates MSIs directly to the system APIC irq domain. + * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the + * pci-hyperv host bridge. + */ +static bool __init ms_hyperv_msi_ext_dest_id(void) +{ + u32 eax; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); + if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) + return false; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; +} + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .type = X86_HYPER_MS_HYPERV, + .init.x2apic_available = ms_hyperv_x2apic_available, + .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 6a80f36b5d59..61eb26edc6d2 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -794,8 +794,6 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; - rcu_cpu_starting(smp_processor_id()); - /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, @@ -813,7 +811,8 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. + * mtrr_save_state - Save current fixed-range MTRR state of the first + * cpu in cpu_online_mask. 
*/ void mtrr_save_state(void) { diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e8b5f1cf1ae8..698bb26aeb6e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -895,6 +895,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index f65d3c0dbc41..ee71c47844cb 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -264,7 +264,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; @@ -619,6 +619,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a98519a3a2e6..7ac31210e452 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -64,6 +64,69 @@ unsigned int rdt_mon_features; */ unsigned int resctrl_cqm_threshold; +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initdata = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. 
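The CF() macro and correction table above store each MBM factor as 20-bit fixed point (factor * 2^20, rounded), so applying the quirk in get_corrected_mbm_count() costs one multiply and one shift rather than floating-point math on the counter path. A small standalone check of that arithmetic; the chunk counts are made-up example values, not data from the patch:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same normalization as the kernel's CF(): factor scaled to 2^20. */
    #define CF(cf) ((uint64_t)(1048576 * (cf) + 0.5))

    static uint64_t corrected(uint64_t chunks, uint64_t cf)
    {
        return (chunks * cf) >> 20;    /* corrected_value = (value * factor) >> 20 */
    }

    int main(void)
    {
        uint64_t chunks = 1000000;                     /* arbitrary example count */

        printf("cf=1.000000 -> %" PRIu64 "\n", corrected(chunks, CF(1.000000)));
        printf("cf=1.142857 -> %" PRIu64 "\n", corrected(chunks, CF(1.142857)));
        printf("cf=0.969650 -> %" PRIu64 "\n", corrected(chunks, CF(0.969650)));
        return 0;
    }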
*/ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + static inline struct rmid_entry *__rmid_entry(u32 rmid) { struct rmid_entry *entry; @@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) m->chunks += chunks; m->prev_msr = tval; - rr->val += m->chunks; + rr->val += get_corrected_mbm_count(rmid, m->chunks); + return 0; } @@ -279,7 +343,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - cur_bw = (chunks * r->mon_scale) >> 20; + cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); @@ -642,3 +706,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f3418428682b..29ffb95b25ff 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; @@ -3023,8 +3023,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -3056,8 +3055,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); @@ -3066,8 +3064,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn, return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -3094,7 +3091,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + 
rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. @@ -3131,13 +3128,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 866c9a9bcdee..236924930bf0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -44,6 +44,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, + { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile new file mode 100644 index 000000000000..91d3dc784a29 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -0,0 +1,5 @@ +obj-y += \ + driver.o \ + encl.o \ + ioctl.o \ + main.o diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h new file mode 100644 index 000000000000..dd7602c44c72 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/arch.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains data structures defined by the SGX architecture. Data structures + * defined by the Linux software stack should not be placed here. + */ +#ifndef _ASM_X86_SGX_ARCH_H +#define _ASM_X86_SGX_ARCH_H + +#include <linux/bits.h> +#include <linux/types.h> + +/* The SGX specific CPUID function. */ +#define SGX_CPUID 0x12 +/* EPC enumeration. */ +#define SGX_CPUID_EPC 2 +/* An invalid EPC section, i.e. the end marker. */ +#define SGX_CPUID_EPC_INVALID 0x0 +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +/* The bitmask for the EPC section type. */ +#define SGX_CPUID_EPC_MASK GENMASK(3, 0) + +/** + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * been completed yet. + * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + */ +enum sgx_return_code { + SGX_NOT_TRACKED = 11, + SGX_INVALID_EINITTOKEN = 16, + SGX_UNMASKED_EVENT = 128, +}; + +/* The modulus size for 3072-bit RSA keys. */ +#define SGX_MODULUS_SIZE 384 + +/** + * enum sgx_miscselect - additional information to an SSA frame + * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * + * Save State Area (SSA) is a stack inside the enclave used to store processor + * state when an exception or interrupt occurs. This enum defines additional + * information stored to an SSA frame. + */ +enum sgx_miscselect { + SGX_MISC_EXINFO = BIT(0), +}; + +#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) + +#define SGX_SSA_GPRS_SIZE 184 +#define SGX_SSA_MISC_EXINFO_SIZE 16 + +/** + * enum sgx_attributes - the attributes field in &struct sgx_secs + * %SGX_ATTR_INIT: Enclave can be entered (is initialized). 
+ * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * attestation. + * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * sign cryptographic tokens that can be passed to + * EINIT as an authorization to run an enclave. + */ +enum sgx_attribute { + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + SGX_ATTR_KSS = BIT(7), +}; + +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) + +/** + * struct sgx_secs - SGX Enclave Control Structure (SECS) + * @size: size of the address space + * @base: base address of the address space + * @ssa_frame_size: size of an SSA frame + * @miscselect: additional information stored to an SSA frame + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @mrenclave: SHA256-hash of the enclave contents + * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT + * @config_id: a user-defined value that is used in key derivation + * @isv_prod_id: a user-defined value that is used in key derivation + * @isv_svn: a user-defined value that is used in key derivation + * @config_svn: a user-defined value that is used in key derivation + * + * SGX Enclave Control Structure (SECS) is a special enclave page that is not + * visible in the address space. In fact, this structure defines the address + * range and other global attributes for the enclave and it is the first EPC + * page created for any enclave. It is moved from a temporary buffer to an EPC + * by the means of ENCLS[ECREATE] function. + */ +struct sgx_secs { + u64 size; + u64 base; + u32 ssa_frame_size; + u32 miscselect; + u8 reserved1[24]; + u64 attributes; + u64 xfrm; + u32 mrenclave[8]; + u8 reserved2[32]; + u32 mrsigner[8]; + u8 reserved3[32]; + u32 config_id[16]; + u16 isv_prod_id; + u16 isv_svn; + u16 config_svn; + u8 reserved4[3834]; +} __packed; + +/** + * enum sgx_tcs_flags - execution flags for TCS + * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * inside an enclave. It is cleared by EADD but can + * be set later with EDBGWR. + */ +enum sgx_tcs_flags { + SGX_TCS_DBGOPTIN = 0x01, +}; + +#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) +#define SGX_TCS_RESERVED_SIZE 4024 + +/** + * struct sgx_tcs - Thread Control Structure (TCS) + * @state: used to mark an entered TCS + * @flags: execution flags (cleared by EADD) + * @ssa_offset: SSA stack offset relative to the enclave base + * @ssa_index: the current SSA frame index (cleard by EADD) + * @nr_ssa_frames: the number of frame in the SSA stack + * @entry_offset: entry point offset relative to the enclave base + * @exit_addr: address outside the enclave to exit on an exception or + * interrupt + * @fs_offset: offset relative to the enclave base to become FS + * segment inside the enclave + * @gs_offset: offset relative to the enclave base to become GS + * segment inside the enclave + * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) + * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) + * + * Thread Control Structure (TCS) is an enclave page visible in its address + * space that defines an entry point inside the enclave. 
A thread enters inside + * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered + * by only one thread at a time. + */ +struct sgx_tcs { + u64 state; + u64 flags; + u64 ssa_offset; + u32 ssa_index; + u32 nr_ssa_frames; + u64 entry_offset; + u64 exit_addr; + u64 fs_offset; + u64 gs_offset; + u32 fs_limit; + u32 gs_limit; + u8 reserved[SGX_TCS_RESERVED_SIZE]; +} __packed; + +/** + * struct sgx_pageinfo - an enclave page descriptor + * @addr: address of the enclave page + * @contents: pointer to the page contents + * @metadata: pointer either to a SECINFO or PCMD instance + * @secs: address of the SECS page + */ +struct sgx_pageinfo { + u64 addr; + u64 contents; + u64 metadata; + u64 secs; +} __packed __aligned(32); + + +/** + * enum sgx_page_type - bits in the SECINFO flags defining the page type + * %SGX_PAGE_TYPE_SECS: a SECS page + * %SGX_PAGE_TYPE_TCS: a TCS page + * %SGX_PAGE_TYPE_REG: a regular page + * %SGX_PAGE_TYPE_VA: a VA page + * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + */ +enum sgx_page_type { + SGX_PAGE_TYPE_SECS, + SGX_PAGE_TYPE_TCS, + SGX_PAGE_TYPE_REG, + SGX_PAGE_TYPE_VA, + SGX_PAGE_TYPE_TRIM, +}; + +#define SGX_NR_PAGE_TYPES 5 +#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) + +/** + * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo + * %SGX_SECINFO_R: allow read + * %SGX_SECINFO_W: allow write + * %SGX_SECINFO_X: allow execution + * %SGX_SECINFO_SECS: a SECS page + * %SGX_SECINFO_TCS: a TCS page + * %SGX_SECINFO_REG: a regular page + * %SGX_SECINFO_VA: a VA page + * %SGX_SECINFO_TRIM: a page in trimmed state + */ +enum sgx_secinfo_flags { + SGX_SECINFO_R = BIT(0), + SGX_SECINFO_W = BIT(1), + SGX_SECINFO_X = BIT(2), + SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), + SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), + SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), + SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), + SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), +}; + +#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) +#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) +#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ + SGX_SECINFO_PAGE_TYPE_MASK) + +/** + * struct sgx_secinfo - describes attributes of an EPC page + * @flags: permissions and type + * + * Used together with ENCLS leaves that add or modify an EPC page to an + * enclave to define page permissions and type. + */ +struct sgx_secinfo { + u64 flags; + u8 reserved[56]; +} __packed __aligned(64); + +#define SGX_PCMD_RESERVED_SIZE 40 + +/** + * struct sgx_pcmd - Paging Crypto Metadata (PCMD) + * @enclave_id: enclave identifier + * @mac: MAC over PCMD, page contents and isvsvn + * + * PCMD is stored for every swapped page to the regular memory. When ELDU loads + * the page back it recalculates the MAC by using a isvsvn number stored in a + * VA page. Together these two structures bring integrity and rollback + * protection. 
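The SECINFO definitions above encode a page's permissions in bits 2:0 and its page type in bits 15:8 (the page-type enum shifted left by 8), and the structure itself is padded to a fixed 64 bytes. A standalone illustration of composing those flags, with the constants re-declared locally rather than shared with the kernel header:

    #include <stdint.h>

    /* Local copies of the values documented above, for illustration only. */
    #define SECINFO_R        (1u << 0)
    #define SECINFO_W        (1u << 1)
    #define SECINFO_X        (1u << 2)
    #define PAGE_TYPE_REG    2u                 /* enum sgx_page_type value */
    #define SECINFO_REG      (PAGE_TYPE_REG << 8)

    struct secinfo_layout {
        uint64_t flags;
        uint8_t  reserved[56];
    } __attribute__((packed, aligned(64)));

    _Static_assert(sizeof(struct secinfo_layout) == 64, "SECINFO is a fixed 64-byte block");

    int main(void)
    {
        /* A writable, non-executable regular enclave page. */
        struct secinfo_layout si = { .flags = SECINFO_REG | SECINFO_R | SECINFO_W };

        return si.flags == 0x203 ? 0 : 1;       /* type 2 in bits 15:8, RW in bits 2:0 */
    }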
+ */ +struct sgx_pcmd { + struct sgx_secinfo secinfo; + u64 enclave_id; + u8 reserved[SGX_PCMD_RESERVED_SIZE]; + u8 mac[16]; +} __packed __aligned(128); + +#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 +#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 +#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 +#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 + +/** + * struct sgx_sigstruct_header - defines author of the enclave + * @header1: constant byte string + * @vendor: must be either 0x0000 or 0x8086 + * @date: YYYYMMDD in BCD + * @header2: costant byte string + * @swdefined: software defined value + */ +struct sgx_sigstruct_header { + u64 header1[2]; + u32 vendor; + u32 date; + u64 header2[2]; + u32 swdefined; + u8 reserved1[84]; +} __packed; + +/** + * struct sgx_sigstruct_body - defines contents of the enclave + * @miscselect: additional information stored to an SSA frame + * @misc_mask: required miscselect in SECS + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @attributes_mask: required attributes in SECS + * @xfrm_mask: required XFRM in SECS + * @mrenclave: SHA256-hash of the enclave contents + * @isvprodid: a user-defined value that is used in key derivation + * @isvsvn: a user-defined value that is used in key derivation + */ +struct sgx_sigstruct_body { + u32 miscselect; + u32 misc_mask; + u8 reserved2[20]; + u64 attributes; + u64 xfrm; + u64 attributes_mask; + u64 xfrm_mask; + u8 mrenclave[32]; + u8 reserved3[32]; + u16 isvprodid; + u16 isvsvn; +} __packed; + +/** + * struct sgx_sigstruct - an enclave signature + * @header: defines author of the enclave + * @modulus: the modulus of the public key + * @exponent: the exponent of the public key + * @signature: the signature calculated over the fields except modulus, + * @body: defines contents of the enclave + * @q1: a value used in RSA signature verification + * @q2: a value used in RSA signature verification + * + * Header and body are the parts that are actual signed. The remaining fields + * define the signature of the enclave. + */ +struct sgx_sigstruct { + struct sgx_sigstruct_header header; + u8 modulus[SGX_MODULUS_SIZE]; + u32 exponent; + u8 signature[SGX_MODULUS_SIZE]; + struct sgx_sigstruct_body body; + u8 reserved4[12]; + u8 q1[SGX_MODULUS_SIZE]; + u8 q2[SGX_MODULUS_SIZE]; +} __packed; + +#define SGX_LAUNCH_TOKEN_SIZE 304 + +#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c new file mode 100644 index 000000000000..f2eac41bb4ff --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. 
*/ + +#include <linux/acpi.h> +#include <linux/miscdevice.h> +#include <linux/mman.h> +#include <linux/security.h> +#include <linux/suspend.h> +#include <asm/traps.h> +#include "driver.h" +#include "encl.h" + +u64 sgx_attributes_reserved_mask; +u64 sgx_xfrm_reserved_mask = ~0x3; +u32 sgx_misc_reserved_mask; + +static int sgx_open(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl; + int ret; + + encl = kzalloc(sizeof(*encl), GFP_KERNEL); + if (!encl) + return -ENOMEM; + + kref_init(&encl->refcount); + xa_init(&encl->page_array); + mutex_init(&encl->lock); + INIT_LIST_HEAD(&encl->va_pages); + INIT_LIST_HEAD(&encl->mm_list); + spin_lock_init(&encl->mm_lock); + + ret = init_srcu_struct(&encl->srcu); + if (ret) { + kfree(encl); + return ret; + } + + file->private_data = encl; + + return 0; +} + +static int sgx_release(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl = file->private_data; + struct sgx_encl_mm *encl_mm; + + /* + * Drain the remaining mm_list entries. At this point the list contains + * entries for processes, which have closed the enclave file but have + * not exited yet. The processes, which have exited, are gone from the + * list by sgx_mmu_notifier_release(). + */ + for ( ; ; ) { + spin_lock(&encl->mm_lock); + + if (list_empty(&encl->mm_list)) { + encl_mm = NULL; + } else { + encl_mm = list_first_entry(&encl->mm_list, + struct sgx_encl_mm, list); + list_del_rcu(&encl_mm->list); + } + + spin_unlock(&encl->mm_lock); + + /* The enclave is no longer mapped by any mm. */ + if (!encl_mm) + break; + + synchronize_srcu(&encl->srcu); + mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); + kfree(encl_mm); + } + + kref_put(&encl->refcount, sgx_encl_release); + return 0; +} + +static int sgx_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_encl *encl = file->private_data; + int ret; + + ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags); + if (ret) + return ret; + + ret = sgx_encl_mm_add(encl, vma->vm_mm); + if (ret) + return ret; + + vma->vm_ops = &sgx_vm_ops; + vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vma->vm_private_data = encl; + + return 0; +} + +static unsigned long sgx_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (flags & MAP_FIXED) + return addr; + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +#ifdef CONFIG_COMPAT +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return sgx_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations sgx_encl_fops = { + .owner = THIS_MODULE, + .open = sgx_open, + .release = sgx_release, + .unlocked_ioctl = sgx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sgx_compat_ioctl, +#endif + .mmap = sgx_mmap, + .get_unmapped_area = sgx_get_unmapped_area, +}; + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_enclave = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_enclave", + .nodename = "sgx_enclave", + .fops = &sgx_encl_fops, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +int __init sgx_drv_init(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 attr_mask; + u64 xfrm_mask; + int ret; + + if 
(!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + return -ENODEV; + + cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if (!(eax & 1)) { + pr_err("SGX disabled: SGX1 instruction support not available.\n"); + return -ENODEV; + } + + sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; + + cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx); + + attr_mask = (((u64)ebx) << 32) + (u64)eax; + sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK; + + if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + xfrm_mask = (((u64)edx) << 32) + (u64)ecx; + sgx_xfrm_reserved_mask = ~xfrm_mask; + } + + ret = misc_register(&sgx_dev_enclave); + if (ret) + return ret; + + ret = misc_register(&sgx_dev_provision); + if (ret) { + misc_deregister(&sgx_dev_enclave); + return ret; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h new file mode 100644 index 000000000000..4eddb4d571ef --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_SGX_DRIVER_H__ +#define __ARCH_SGX_DRIVER_H__ + +#include <crypto/hash.h> +#include <linux/kref.h> +#include <linux/mmu_notifier.h> +#include <linux/radix-tree.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <uapi/asm/sgx.h> +#include "sgx.h" + +#define SGX_EINIT_SPIN_COUNT 20 +#define SGX_EINIT_SLEEP_COUNT 50 +#define SGX_EINIT_SLEEP_TIME 20 + +extern u64 sgx_attributes_reserved_mask; +extern u64 sgx_xfrm_reserved_mask; +extern u32 sgx_misc_reserved_mask; + +extern const struct file_operations sgx_provision_fops; + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +int sgx_drv_init(void); + +#endif /* __ARCH_X86_SGX_DRIVER_H__ */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c new file mode 100644 index 000000000000..ee50a5010277 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -0,0 +1,740 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/lockdep.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/shmem_fs.h> +#include <linux/suspend.h> +#include <linux/sched/mm.h> +#include "arch.h" +#include "encl.h" +#include "encls.h" +#include "sgx.h" + +/* + * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC + * Pages" in the SDM. 
+ */ +static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_epc_page *secs_page) +{ + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_pageinfo pginfo; + struct sgx_backing b; + pgoff_t page_index; + int ret; + + if (secs_page) + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + else + page_index = PFN_DOWN(encl->size); + + ret = sgx_encl_get_backing(encl, page_index, &b); + if (ret) + return ret; + + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.contents = (unsigned long)kmap_atomic(b.contents); + pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + + b.pcmd_offset; + + if (secs_page) + pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); + else + pginfo.secs = 0; + + ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page), + sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ELDU"); + + ret = -EFAULT; + } + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + sgx_encl_put_backing(&b, false); + + return ret; +} + +static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *secs_page) +{ + + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) + return epc_page; + + ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); + if (ret) { + sgx_free_epc_page(epc_page); + return ERR_PTR(ret); + } + + sgx_free_va_slot(encl_page->va_page, va_offset); + list_move(&encl_page->va_page->list, &encl->va_pages); + encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK; + encl_page->epc_page = epc_page; + + return epc_page; +} + +static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_epc_page *epc_page; + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the faulted page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + /* Entry successfully located. 
*/ + if (entry->epc_page) { + if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED) + return ERR_PTR(-EBUSY); + + return entry; + } + + if (!(encl->secs.epc_page)) { + epc_page = sgx_encl_eldu(&encl->secs, NULL); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + } + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + encl->secs_child_cnt++; + sgx_mark_page_reclaimable(entry->epc_page); + + return entry; +} + +static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) +{ + unsigned long addr = (unsigned long)vmf->address; + struct vm_area_struct *vma = vmf->vma; + struct sgx_encl_page *entry; + unsigned long phys_addr; + struct sgx_encl *encl; + unsigned long pfn; + vm_fault_t ret; + + encl = vma->vm_private_data; + + /* + * It's very unlikely but possible that allocating memory for the + * mm_list entry of a forked process failed in sgx_vma_open(). When + * this happens, vm_private_data is set to NULL. + */ + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + + if (PTR_ERR(entry) == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; + } + + phys_addr = sgx_get_epc_phys_addr(entry->epc_page); + + /* Check if another thread got here first to insert the PTE. */ + if (!follow_pfn(vma, addr, &pfn)) { + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; + } + + ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (ret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + + return VM_FAULT_SIGBUS; + } + + sgx_encl_test_and_clear_young(vma->vm_mm, entry); + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; +} + +static void sgx_vma_open(struct vm_area_struct *vma) +{ + struct sgx_encl *encl = vma->vm_private_data; + + /* + * It's possible but unlikely that vm_private_data is NULL. This can + * happen in a grandchild of a process, when sgx_encl_mm_add() had + * failed to allocate memory in this callback. + */ + if (unlikely(!encl)) + return; + + if (sgx_encl_mm_add(encl, vma->vm_mm)) + vma->vm_private_data = NULL; +} + + +/** + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed + * @encl: an enclave pointer + * @start: lower bound of the address range, inclusive + * @end: upper bound of the address range, exclusive + * @vm_flags: VMA flags + * + * Iterate through the enclave pages contained within [@start, @end) to verify + * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC} + * do not contain any permissions that are not contained in the build time + * permissions of any of the enclave pages within the given address range. + * + * An enclave creator must declare the strongest permissions that will be + * needed for each enclave page. This ensures that mappings have the identical + * or weaker permissions than the earlier declared permissions. + * + * Return: 0 on success, -EACCES otherwise + */ +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *page; + unsigned long count = 0; + int ret = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. 
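The enclave-page permission policy described above (and enforced again in the sgx_encl_may_map() body that follows) boils down to a subset test: the {read, write, exec} bits requested for the VMA must not contain anything outside the page's declared vm_max_prot_bits. The two spellings used in the patch, (max & req) != req and (~max & req), reject exactly the same cases. A tiny standalone check; the flag values are stand-ins for VM_READ/VM_WRITE/VM_EXEC, not the kernel's definitions:

    #include <assert.h>
    #include <stdbool.h>

    #define R 0x1
    #define W 0x2
    #define X 0x4

    /* Equivalent to the patch's "(max & req) != req" test written the other way. */
    static bool may_map(unsigned long max_prot, unsigned long req)
    {
        return (~max_prot & req) == 0;  /* nothing requested beyond the declared max */
    }

    int main(void)
    {
        assert(may_map(R | W, R));          /* RW page mapped read-only: allowed   */
        assert(may_map(R | W, R | W));      /* exactly the declared bits: allowed  */
        assert(!may_map(R | W, R | X));     /* exec was never declared: rejected   */
        return 0;
    }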
+ */ + if (current->personality & READ_IMPLIES_EXEC) + return -EACCES; + + mutex_lock(&encl->lock); + xas_lock(&xas); + xas_for_each(&xas, page, PFN_DOWN(end - 1)) { + if (~page->vm_max_prot_bits & vm_prot_bits) { + ret = -EACCES; + break; + } + + /* Reschedule on every XA_CHECK_SCHED iteration. */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + cond_resched(); + + mutex_lock(&encl->lock); + xas_lock(&xas); + } + } + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + return ret; +} + +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); +} + +static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + + ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +/* + * Load an enclave page to EPC if required, and take encl->lock. + */ +static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + struct sgx_encl_page *entry; + + for ( ; ; ) { + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + + mutex_unlock(&encl->lock); + } + + if (IS_ERR(entry)) + mutex_unlock(&encl->lock); + + return entry; +} + +static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct sgx_encl *encl = vma->vm_private_data; + struct sgx_encl_page *entry = NULL; + char data[sizeof(unsigned long)]; + unsigned long align; + int offset; + int cnt; + int ret = 0; + int i; + + /* + * If process was forked, VMA is still there but vm_private_data is set + * to NULL. + */ + if (!encl) + return -EFAULT; + + if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) + return -EFAULT; + + for (i = 0; i < len; i += cnt) { + entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, + vma->vm_flags); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + break; + } + + align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); + offset = (addr + i) & (sizeof(unsigned long) - 1); + cnt = sizeof(unsigned long) - offset; + cnt = min(cnt, len - i); + + ret = sgx_encl_debug_read(encl, entry, align, data); + if (ret) + goto out; + + if (write) { + memcpy(data + offset, buf + i, cnt); + ret = sgx_encl_debug_write(encl, entry, align, data); + if (ret) + goto out; + } else { + memcpy(buf + i, data + offset, cnt); + } + +out: + mutex_unlock(&encl->lock); + + if (ret) + break; + } + + return ret < 0 ? ret : i; +} + +const struct vm_operations_struct sgx_vm_ops = { + .fault = sgx_vma_fault, + .mprotect = sgx_vma_mprotect, + .open = sgx_vma_open, + .access = sgx_vma_access, +}; + +/** + * sgx_encl_release - Destroy an enclave instance + * @kref: address of a kref inside &sgx_encl + * + * Used together with kref_put(). Frees all the resources associated with the + * enclave and the instance itself. 
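The sgx_encl_reserve_page() helper above is a compact "retry while transiently busy" loop: take encl->lock, try to materialize the page, and if it is momentarily owned by the reclaimer (-EBUSY) drop the lock and try again, so the function returns with the lock held only on success or a hard error. The same shape sketched with a pthread mutex and a fake resource; everything here is illustrative and single-threaded, whereas the real busy state is cleared by the reclaimer on another CPU:

    #include <errno.h>
    #include <pthread.h>

    struct resource {
        pthread_mutex_t lock;
        int busy;               /* transient state, e.g. "being reclaimed" */
        int ready;
    };

    /* Try to produce the object while the lock is held; -EBUSY means "retry". */
    static int try_load(struct resource *r)
    {
        if (r->busy)
            return -EBUSY;
        r->ready = 1;
        return 0;
    }

    /* Returns 0 with r->lock held, or a negative error with the lock released. */
    static int reserve(struct resource *r)
    {
        int ret;

        for (;;) {
            pthread_mutex_lock(&r->lock);
            ret = try_load(r);
            if (ret != -EBUSY)
                break;
            pthread_mutex_unlock(&r->lock);  /* let the other side finish */
        }

        if (ret)
            pthread_mutex_unlock(&r->lock);
        return ret;
    }

    int main(void)
    {
        struct resource r = { .lock = PTHREAD_MUTEX_INITIALIZER, .busy = 0, .ready = 0 };

        if (!reserve(&r))
            pthread_mutex_unlock(&r.lock);   /* caller releases after use */
        return 0;
    }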
+ */ +void sgx_encl_release(struct kref *ref) +{ + struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + struct sgx_va_page *va_page; + struct sgx_encl_page *entry; + unsigned long index; + + xa_for_each(&encl->page_array, index, entry) { + if (entry->epc_page) { + /* + * The page and its radix tree entry cannot be freed + * if the page is being held by the reclaimer. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) + continue; + + sgx_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + } + + kfree(entry); + } + + xa_destroy(&encl->page_array); + + if (!encl->secs_child_cnt && encl->secs.epc_page) { + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + } + + while (!list_empty(&encl->va_pages)) { + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + list_del(&va_page->list); + sgx_free_epc_page(va_page->epc_page); + kfree(va_page); + } + + if (encl->backing) + fput(encl->backing); + + cleanup_srcu_struct(&encl->srcu); + + WARN_ON_ONCE(!list_empty(&encl->mm_list)); + + /* Detect EPC page leak's. */ + WARN_ON_ONCE(encl->secs_child_cnt); + WARN_ON_ONCE(encl->secs.epc_page); + + kfree(encl); +} + +/* + * 'mm' is exiting and no longer needs mmu notifications. + */ +static void sgx_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + struct sgx_encl_mm *tmp = NULL; + + /* + * The enclave itself can remove encl_mm. Note, objects can't be moved + * off an RCU protected list, but deletion is ok. + */ + spin_lock(&encl_mm->encl->mm_lock); + list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { + if (tmp == encl_mm) { + list_del_rcu(&encl_mm->list); + break; + } + } + spin_unlock(&encl_mm->encl->mm_lock); + + if (tmp == encl_mm) { + synchronize_srcu(&encl_mm->encl->srcu); + mmu_notifier_put(mn); + } +} + +static void sgx_mmu_notifier_free(struct mmu_notifier *mn) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + + kfree(encl_mm); +} + +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { + .release = sgx_mmu_notifier_release, + .free_notifier = sgx_mmu_notifier_free, +}; + +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = NULL; + struct sgx_encl_mm *tmp; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(tmp, &encl->mm_list, list) { + if (tmp->mm == mm) { + encl_mm = tmp; + break; + } + } + + srcu_read_unlock(&encl->srcu, idx); + + return encl_mm; +} + +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm; + int ret; + + /* + * Even though a single enclave may be mapped into an mm more than once, + * each 'mm' only appears once on encl->mm_list. This is guaranteed by + * holding the mm's mmap lock for write before an mm can be added or + * remove to an encl->mm_list. + */ + mmap_assert_write_locked(mm); + + /* + * It's possible that an entry already exists in the mm_list, because it + * is removed only on VFS release or process exit. 
+ */ + if (sgx_encl_find_mm(encl, mm)) + return 0; + + encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); + if (!encl_mm) + return -ENOMEM; + + encl_mm->encl = encl; + encl_mm->mm = mm; + encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; + + ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); + if (ret) { + kfree(encl_mm); + return ret; + } + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); + /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); + + return 0; +} + +static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) +{ + struct inode *inode = encl->backing->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + gfp_t gfpmask = mapping_gfp_mask(mapping); + + return shmem_read_mapping_page_gfp(mapping, index, gfpmask); +} + +/** + * sgx_encl_get_backing() - Pin the backing storage + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Pin the backing storage pages for storing the encrypted contents and Paging + * Crypto MetaData (PCMD) of an enclave page. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + struct page *contents; + struct page *pcmd; + + contents = sgx_encl_get_backing_page(encl, page_index); + if (IS_ERR(contents)) + return PTR_ERR(contents); + + pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + if (IS_ERR(pcmd)) { + put_page(contents); + return PTR_ERR(pcmd); + } + + backing->page_index = page_index; + backing->contents = contents; + backing->pcmd = pcmd; + backing->pcmd_offset = + (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * + sizeof(struct sgx_pcmd); + + return 0; +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + * @do_write: mark pages dirty + */ +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +{ + if (do_write) { + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); + } + + put_page(backing->pcmd); + put_page(backing->contents); +} + +static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, + void *data) +{ + pte_t pte; + int ret; + + ret = pte_young(*ptep); + if (ret) { + pte = pte_mkold(*ptep); + set_pte_at((struct mm_struct *)data, addr, ptep, pte); + } + + return ret; +} + +/** + * sgx_encl_test_and_clear_young() - Test and reset the accessed bit + * @mm: mm_struct that is checked + * @page: enclave page to be tested for recent access + * + * Checks the Access (A) bit from the PTE corresponding to the enclave page and + * clears it. + * + * Return: 1 if the page has been recently accessed and 0 if not. 
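+ *
+ * The reclaimer uses this for aging, roughly as in sgx_reclaimer_age():
+ *
+ *	ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);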
+ */ +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page) +{ + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + struct vm_area_struct *vma; + int ret; + + ret = sgx_encl_find(mm, addr, &vma); + if (ret) + return 0; + + if (encl != vma->vm_private_data) + return 0; + + ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, + sgx_encl_test_and_clear_young_cb, vma->vm_mm); + if (ret < 0) + return 0; + + return ret; +} + +/** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. + * + * Return: + * a VA page, + * -errno otherwise + */ +struct sgx_epc_page *sgx_alloc_va_page(void) +{ + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(NULL, true); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + ret = __epa(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); + sgx_free_epc_page(epc_page); + return ERR_PTR(-EFAULT); + } + + return epc_page; +} + +/** + * sgx_alloc_va_slot - allocate a VA slot + * @va_page: a &struct sgx_va_page instance + * + * Allocates a slot from a &struct sgx_va_page instance. + * + * Return: offset of the slot inside the VA page + */ +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + if (slot < SGX_VA_SLOT_COUNT) + set_bit(slot, va_page->slots); + + return slot << 3; +} + +/** + * sgx_free_va_slot - free a VA slot + * @va_page: a &struct sgx_va_page instance + * @offset: offset of the slot inside the VA page + * + * Frees a slot from a &struct sgx_va_page instance. + */ +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) +{ + clear_bit(offset >> 3, va_page->slots); +} + +/** + * sgx_va_page_full - is the VA page full? + * @va_page: a &struct sgx_va_page instance + * + * Return: true if all slots have been taken + */ +bool sgx_va_page_full(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + return slot == SGX_VA_SLOT_COUNT; +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h new file mode 100644 index 000000000000..d8d30ccbef4c --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains the software defined data structures for enclaves. + */ +#ifndef _X86_ENCL_H +#define _X86_ENCL_H + +#include <linux/cpumask.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mm_types.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/srcu.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include "sgx.h" + +/* 'desc' bits holding the offset in the VA (version array) page. */ +#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3) + +/* 'desc' bit marking that the page is being reclaimed. 
*/ +#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3) + +struct sgx_encl_page { + unsigned long desc; + unsigned long vm_max_prot_bits; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +}; + +enum sgx_encl_flags { + SGX_ENCL_IOCTL = BIT(0), + SGX_ENCL_DEBUG = BIT(1), + SGX_ENCL_CREATED = BIT(2), + SGX_ENCL_INITIALIZED = BIT(3), +}; + +struct sgx_encl_mm { + struct sgx_encl *encl; + struct mm_struct *mm; + struct list_head list; + struct mmu_notifier mmu_notifier; +}; + +struct sgx_encl { + unsigned long base; + unsigned long size; + unsigned long flags; + unsigned int page_cnt; + unsigned int secs_child_cnt; + struct mutex lock; + struct xarray page_array; + struct sgx_encl_page secs; + unsigned long attributes; + unsigned long attributes_mask; + + cpumask_t cpumask; + struct file *backing; + struct kref refcount; + struct list_head va_pages; + unsigned long mm_list_version; + struct list_head mm_list; + spinlock_t mm_lock; + struct srcu_struct srcu; +}; + +#define SGX_VA_SLOT_COUNT 512 + +struct sgx_va_page { + struct sgx_epc_page *epc_page; + DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT); + struct list_head list; +}; + +struct sgx_backing { + pgoff_t page_index; + struct page *contents; + struct page *pcmd; + unsigned long pcmd_offset; +}; + +extern const struct vm_operations_struct sgx_vm_ops; + +static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **vma) +{ + struct vm_area_struct *result; + + result = find_vma(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + return -EINVAL; + + *vma = result; + + return 0; +} + +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags); + +void sgx_encl_release(struct kref *ref); +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); + +struct sgx_epc_page *sgx_alloc_va_page(void); +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +bool sgx_va_page_full(struct sgx_va_page *va_page); + +#endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h new file mode 100644 index 000000000000..443188fe7e70 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENCLS_H +#define _X86_ENCLS_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include <asm/traps.h> +#include "sgx.h" + +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, +}; + +/** + * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr + * + * ENCLS has its own (positive value) error codes and also generates + * ENCLS specific #GP and #PF faults. And the ENCLS values get munged + * with system error codes as everything percolates back up the stack. + * Unfortunately (for us), we need to precisely identify each unique + * error code, e.g. 
the action taken if EWB fails varies based on the + * type of fault and on the exact SGX error code, i.e. we can't simply + * convert all faults to -EFAULT. + * + * To make all three error types coexist, we set bit 30 to identify an + * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate + * between positive (faults and SGX error codes) and negative (system + * error codes) values. + */ +#define ENCLS_FAULT_FLAG 0x40000000 + +/* Retrieve the encoded trapnr from the specified return code. */ +#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) + +/* Issue a WARN() about an ENCLS function. */ +#define ENCLS_WARN(r, name) { \ + do { \ + int _r = (r); \ + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \ + } while (0); \ +} + +/** + * encls_failed() - Check if an ENCLS function failed + * @ret: the return value of an ENCLS function call + * + * Check if an ENCLS function failed. This happens when the function causes a + * fault that is not caused by an EPCM conflict or when the function returns a + * non-zero value. + */ +static inline bool encls_failed(int ret) +{ + if (ret & ENCLS_FAULT_FLAG) + return ENCLS_TRAPNR(ret) != X86_TRAP_PF; + + return !!ret; +} + +/** + * __encls_ret_N - encode an ENCLS function that returns an error code in EAX + * @rax: function number + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE. + * And because SGX isn't complex enough as it is, function that return an error + * code also modify flags. + * + * Return: + * 0 on success, + * SGX error code on failure + */ +#define __encls_ret_N(rax, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret) \ + : "a"(rax), inputs \ + : "memory", "cc"); \ + ret; \ + }) + +#define __encls_ret_1(rax, rcx) \ + ({ \ + __encls_ret_N(rax, "c"(rcx)); \ + }) + +#define __encls_ret_2(rax, rbx, rcx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_ret_3(rax, rbx, rcx, rdx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \ + }) + +/** + * __encls_N - encode an ENCLS function that doesn't return an error code + * @rax: function number + * @rbx_out: optional output variable + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that does not return an error code, e.g. + * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an + * optional parameter for use by EDGBRD, which returns the requested value in + * RBX. + * + * Return: + * 0 on success, + * trapnr with ENCLS_FAULT_FLAG set on fault + */ +#define __encls_N(rax, rbx_out, inputs...) 
\ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + " xor %%eax,%%eax;\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret), "=b"(rbx_out) \ + : "a"(rax), inputs \ + : "memory"); \ + ret; \ + }) + +#define __encls_2(rax, rbx, rcx) \ + ({ \ + unsigned long ign_rbx_out; \ + __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_1_1(rax, data, rcx) \ + ({ \ + unsigned long rbx_out; \ + int ret = __encls_N(rax, rbx_out, "c"(rcx)); \ + if (!ret) \ + data = rbx_out; \ + ret; \ + }) + +static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) +{ + return __encls_2(ECREATE, pginfo, secs); +} + +static inline int __eextend(void *secs, void *addr) +{ + return __encls_2(EEXTEND, secs, addr); +} + +static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EADD, pginfo, addr); +} + +static inline int __einit(void *sigstruct, void *token, void *secs) +{ + return __encls_ret_3(EINIT, sigstruct, secs, token); +} + +static inline int __eremove(void *addr) +{ + return __encls_ret_1(EREMOVE, addr); +} + +static inline int __edbgwr(void *addr, unsigned long *data) +{ + return __encls_2(EDGBWR, *data, addr); +} + +static inline int __edbgrd(void *addr, unsigned long *data) +{ + return __encls_1_1(EDGBRD, *data, addr); +} + +static inline int __etrack(void *addr) +{ + return __encls_ret_1(ETRACK, addr); +} + +static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(ELDU, pginfo, addr, va); +} + +static inline int __eblock(void *addr) +{ + return __encls_ret_1(EBLOCK, addr); +} + +static inline int __epa(void *addr) +{ + unsigned long rbx = SGX_PAGE_TYPE_VA; + + return __encls_2(EPA, rbx, addr); +} + +static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(EWB, pginfo, addr, va); +} + +#endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c new file mode 100644 index 000000000000..90a5caf76939 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. 
*/ + +#include <asm/mman.h> +#include <linux/mman.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/hashtable.h> +#include <linux/highmem.h> +#include <linux/ratelimit.h> +#include <linux/sched/signal.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +{ + struct sgx_va_page *va_page = NULL; + void *err; + + BUILD_BUG_ON(SGX_VA_SLOT_COUNT != + (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1); + + if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) { + va_page = kzalloc(sizeof(*va_page), GFP_KERNEL); + if (!va_page) + return ERR_PTR(-ENOMEM); + + va_page->epc_page = sgx_alloc_va_page(); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); + return err; + } + + WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT); + } + encl->page_cnt++; + return va_page; +} + +static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +{ + encl->page_cnt--; + + if (va_page) { + sgx_free_epc_page(va_page->epc_page); + list_del(&va_page->list); + kfree(va_page); + } +} + +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) +{ + struct sgx_epc_page *secs_epc; + struct sgx_va_page *va_page; + struct sgx_pageinfo pginfo; + struct sgx_secinfo secinfo; + unsigned long encl_size; + struct file *backing; + long ret; + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) + list_add(&va_page->list, &encl->va_pages); + /* else the tail page of the VA page list had free slots. */ + + /* The extra page goes to SECS. */ + encl_size = secs->size + PAGE_SIZE; + + backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), + VM_NORESERVE); + if (IS_ERR(backing)) { + ret = PTR_ERR(backing); + goto err_out_shrink; + } + + encl->backing = backing; + + secs_epc = sgx_alloc_epc_page(&encl->secs, true); + if (IS_ERR(secs_epc)) { + ret = PTR_ERR(secs_epc); + goto err_out_backing; + } + + encl->secs.epc_page = secs_epc; + + pginfo.addr = 0; + pginfo.contents = (unsigned long)secs; + pginfo.metadata = (unsigned long)&secinfo; + pginfo.secs = 0; + memset(&secinfo, 0, sizeof(secinfo)); + + ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc)); + if (ret) { + ret = -EIO; + goto err_out; + } + + if (secs->attributes & SGX_ATTR_DEBUG) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; + encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + + /* Set only after completion, as encl->lock has not been taken. */ + set_bit(SGX_ENCL_CREATED, &encl->flags); + + return 0; + +err_out: + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + +err_out_backing: + fput(encl->backing); + encl->backing = NULL; + +err_out_shrink: + sgx_encl_shrink(encl, va_page); + + return ret; +} + +/** + * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE + * @encl: An enclave pointer. + * @arg: The ioctl argument. + * + * Allocate kernel data structures for the enclave and invoke ECREATE. + * + * Return: + * - 0: Success. + * - -EIO: ECREATE failed. + * - -errno: POSIX error. 
+ */ +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_create create_arg; + void *secs; + int ret; + + if (test_bit(SGX_ENCL_CREATED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&create_arg, arg, sizeof(create_arg))) + return -EFAULT; + + secs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!secs) + return -ENOMEM; + + if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE)) + ret = -EFAULT; + else + ret = sgx_encl_create(encl, secs); + + kfree(secs); + return ret; +} + +static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) +{ + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK; + + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS) + return -EINVAL; + + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) + return -EINVAL; + + /* + * CPU will silently overwrite the permissions as zero, which means + * that we need to validate it ourselves. + */ + if (pt == SGX_SECINFO_TCS && perm) + return -EINVAL; + + if (secinfo->flags & SGX_SECINFO_RESERVED_MASK) + return -EINVAL; + + if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved))) + return -EINVAL; + + return 0; +} + +static int __sgx_encl_add_page(struct sgx_encl *encl, + struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_secinfo *secinfo, unsigned long src) +{ + struct sgx_pageinfo pginfo; + struct vm_area_struct *vma; + struct page *src_page; + int ret; + + /* Deny noexec. */ + vma = find_vma(current->mm, src); + if (!vma) + return -EFAULT; + + if (!(vma->vm_flags & VM_MAYEXEC)) + return -EACCES; + + ret = get_user_pages(src, 1, 0, &src_page, NULL); + if (ret < 1) + return -EFAULT; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = (unsigned long)secinfo; + pginfo.contents = (unsigned long)kmap_atomic(src_page); + + ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); + + kunmap_atomic((void *)pginfo.contents); + put_page(src_page); + + return ret ? -EIO : 0; +} + +/* + * If the caller requires measurement of the page as a proof for the content, + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this + * operation until the entire page is measured." 
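+ *
+ * With SGX_EEXTEND_BLOCK_SIZE of 256 bytes this amounts to PAGE_SIZE / 256 =
+ * 16 EEXTEND invocations per 4 KiB page.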
+ */ +static int __sgx_encl_extend(struct sgx_encl *encl, + struct sgx_epc_page *epc_page) +{ + unsigned long offset; + int ret; + + for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) { + ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page), + sgx_get_epc_virt_addr(epc_page) + offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EEXTEND"); + + return -EIO; + } + } + + return 0; +} + +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + unsigned long offset, struct sgx_secinfo *secinfo, + unsigned long flags) +{ + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + int ret; + + encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + kfree(encl_page); + return PTR_ERR(epc_page); + } + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; + } + + mmap_read_lock(current->mm); + mutex_lock(&encl->lock); + + /* + * Adding to encl->va_pages must be done under encl->lock. Ditto for + * deleting (via sgx_encl_shrink()) in the error path. + */ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* + * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. + * can't be gracefully unwound, while failure on EADD/EXTEND is limited + * to userspace errors (or kernel/hardware bugs). + */ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + if (ret) + goto err_out_unlock; + + ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo, + src); + if (ret) + goto err_out; + + /* + * Complete the "add" before doing the "extend" so that the "add" + * isn't in a half-baked state in the extremely unlikely scenario + * the enclave will be destroyed in response to EEXTEND failure. + */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { + ret = __sgx_encl_extend(encl, epc_page); + if (ret) + goto err_out; + } + + sgx_mark_page_reclaimable(encl_page->epc_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + return ret; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_unlock: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + +err_out_free: + sgx_free_epc_page(epc_page); + kfree(encl_page); + + return ret; +} + +/** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer + * @arg: a user pointer to a struct sgx_enclave_add_pages instance + * + * Add one or more pages to an uninitialized enclave, and optionally extend the + * measurement with the contents of the page. The SECINFO and measurement mask + * are applied to all pages. + * + * A SECINFO for a TCS is required to always contain zero permissions because + * CPU silently zeros them. Allowing anything else would cause a mismatch in + * the measurement. + * + * mmap()'s protection bits are capped by the page permissions. For each page + * address, the maximum protection bits are computed with the following + * heuristics: + * + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions. + * 2. A TCS page: PROT_R | PROT_W. + * + * mmap() is not allowed to surpass the minimum of the maximum protection bits + * within the given address range. 
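+ *
+ * An illustrative userspace call (a sketch; enclave_fd, page_buf and secinfo
+ * are placeholder names, not defined by this patch):
+ *
+ *	struct sgx_enclave_add_pages add = {
+ *		.src     = (__u64)page_buf,
+ *		.offset  = 0,
+ *		.length  = 4096,
+ *		.secinfo = (__u64)&secinfo,
+ *		.flags   = SGX_PAGE_MEASURE,
+ *	};
+ *
+ *	ioctl(enclave_fd, SGX_IOC_ENCLAVE_ADD_PAGES, &add);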
+ * + * The function deinitializes kernel data structures for enclave and returns + * -EIO in any of the following conditions: + * + * - Enclave Page Cache (EPC), the physical memory holding enclaves, has + * been invalidated. This will cause EADD and EEXTEND to fail. + * - If the source address is corrupted somehow when executing EADD. + * + * Return: + * - 0: Success. + * - -EACCES: The source page is located in a noexec partition. + * - -ENOMEM: Out of EPC pages. + * - -EINTR: The call was interrupted before data was processed. + * - -EIO: Either EADD or EEXTEND failed because invalid source address + * or power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_add_pages add_arg; + struct sgx_secinfo secinfo; + unsigned long c; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + + if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || + !IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + + if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + return -EINVAL; + + if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, + sizeof(secinfo))) + return -EFAULT; + + if (sgx_validate_secinfo(&secinfo)) + return -EINVAL; + + for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) { + if (signal_pending(current)) { + if (!c) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c, + &secinfo, add_arg.flags); + if (ret) + break; + } + + add_arg.count = c; + + if (copy_to_user(arg, &add_arg, sizeof(add_arg))) + return -EFAULT; + + return ret; +} + +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, + void *hash) +{ + SHASH_DESC_ON_STACK(shash, tfm); + + shash->tfm = tfm; + + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); +} + +static int sgx_get_key_hash(const void *modulus, void *hash) +{ + struct crypto_shash *tfm; + int ret; + + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + ret = __sgx_get_key_hash(tfm, modulus, hash); + + crypto_free_shash(tfm); + return ret; +} + +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, + void *token) +{ + u64 mrsigner[4]; + int i, j, k; + void *addr; + int ret; + + /* + * Deny initializing enclaves with attributes (namely provisioning) + * that have not been explicitly allowed. + */ + if (encl->attributes & ~encl->attributes_mask) + return -EACCES; + + /* + * Attributes should not be enforced *only* against what's available on + * platform (done in sgx_encl_create) but checked and enforced against + * the mask for enforcement in sigstruct. For example an enclave could + * opt to sign with AVX bit in xfrm, but still be loadable on a platform + * without it if the sigstruct->body.attributes_mask does not turn that + * bit on. 
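+	 * In other words, a bit is rejected below only when it is set in the
+	 * sigstruct value, enforced by the corresponding mask, and reserved
+	 * (unsupported) on this platform.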
+ */ + if (sigstruct->body.attributes & sigstruct->body.attributes_mask & + sgx_attributes_reserved_mask) + return -EINVAL; + + if (sigstruct->body.miscselect & sigstruct->body.misc_mask & + sgx_misc_reserved_mask) + return -EINVAL; + + if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask & + sgx_xfrm_reserved_mask) + return -EINVAL; + + ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); + if (ret) + return ret; + + mutex_lock(&encl->lock); + + /* + * ENCLS[EINIT] is interruptible because it has such a high latency, + * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending, + * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be + * serviced. + */ + for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { + for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { + addr = sgx_get_epc_virt_addr(encl->secs.epc_page); + + preempt_disable(); + + for (k = 0; k < 4; k++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + + ret = __einit(sigstruct, token, addr); + + preempt_enable(); + + if (ret == SGX_UNMASKED_EVENT) + continue; + else + break; + } + + if (ret != SGX_UNMASKED_EVENT) + break; + + msleep_interruptible(SGX_EINIT_SLEEP_TIME); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto err_out; + } + } + + if (ret & ENCLS_FAULT_FLAG) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EINIT"); + + ret = -EIO; + } else if (ret) { + pr_debug("EINIT returned %d\n", ret); + ret = -EPERM; + } else { + set_bit(SGX_ENCL_INITIALIZED, &encl->flags); + } + +err_out: + mutex_unlock(&encl->lock); + return ret; +} + +/** + * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_init instance + * + * Flush any outstanding enqueued EADD operations and perform EINIT. The + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match + * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * + * Return: + * - 0: Success. + * - -EPERM: Invalid SIGSTRUCT. + * - -EIO: EINIT failed because of a power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_sigstruct *sigstruct; + struct sgx_enclave_init init_arg; + struct page *initp_page; + void *token; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&init_arg, arg, sizeof(init_arg))) + return -EFAULT; + + initp_page = alloc_page(GFP_KERNEL); + if (!initp_page) + return -ENOMEM; + + sigstruct = kmap(initp_page); + token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); + memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); + + if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct, + sizeof(*sigstruct))) { + ret = -EFAULT; + goto out; + } + + /* + * A legacy field used with Intel signed enclaves. These used to mean + * regular and architectural enclaves. The CPU only accepts these values + * but they do not have any other meaning. + * + * Thus, reject any other values. 
+ */ + if (sigstruct->header.vendor != 0x0000 && + sigstruct->header.vendor != 0x8086) { + ret = -EINVAL; + goto out; + } + + ret = sgx_encl_init(encl, sigstruct, token); + +out: + kunmap(initp_page); + __free_page(initp_page); + return ret; +} + +/** + * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_provision instance + * + * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to + * /dev/sgx_provision. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_provision params; + struct file *file; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + file = fget(params.fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct sgx_encl *encl = filep->private_data; + int ret; + + if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags)) + return -EBUSY; + + switch (cmd) { + case SGX_IOC_ENCLAVE_CREATE: + ret = sgx_ioc_enclave_create(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_ADD_PAGES: + ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_INIT: + ret = sgx_ioc_enclave_init(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + clear_bit(SGX_ENCL_IOCTL, &encl->flags); + return ret; +} diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c new file mode 100644 index 000000000000..c519fc5f6948 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/freezer.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/ratelimit.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; +static int sgx_nr_epc_sections; +static struct task_struct *ksgxd_tsk; +static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); + +/* + * These variables are part of the state of the reclaimer, and must be accessed + * with sgx_reclaimer_lock acquired. + */ +static LIST_HEAD(sgx_active_page_list); + +static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +/* + * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS + * pages whose child pages blocked EREMOVE. 
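+ *
+ * EREMOVE fails on a SECS page while it still has child pages, which is why
+ * ksgxd() makes a second sanitization pass.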
+ */ +static void sgx_sanitize_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + LIST_HEAD(dirty); + int ret; + + /* init_laundry_list is thread-local, no need for a lock: */ + while (!list_empty(§ion->init_laundry_list)) { + if (kthread_should_stop()) + return; + + /* needed for access to ->page_list: */ + spin_lock(§ion->lock); + + page = list_first_entry(§ion->init_laundry_list, + struct sgx_epc_page, list); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (!ret) + list_move(&page->list, §ion->page_list); + else + list_move_tail(&page->list, &dirty); + + spin_unlock(§ion->lock); + + cond_resched(); + } + + list_splice(&dirty, §ion->init_laundry_list); +} + +static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + struct sgx_encl *encl = page->encl; + struct sgx_encl_mm *encl_mm; + bool ret = true; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + + if (!ret) + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + if (!ret) + return false; + + return true; +} + +static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); + + mutex_lock(&encl->lock); + + ret = __eblock(sgx_get_epc_virt_addr(epc_page)); + if (encls_failed(ret)) + ENCLS_WARN(ret, "EBLOCK"); + + mutex_unlock(&encl->lock); +} + +static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + struct sgx_backing *backing) +{ + struct sgx_pageinfo pginfo; + int ret; + + pginfo.addr = 0; + pginfo.secs = 0; + + pginfo.contents = (unsigned long)kmap_atomic(backing->contents); + pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + backing->pcmd_offset; + + ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + backing->pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + return ret; +} + +static void sgx_ipi_cb(void *info) +{ +} + +static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * Can race with sgx_encl_mm_add(), but ETRACK has already been + * executed, which means that the CPUs running in the new mm will enter + * into the enclave with a fresh epoch. 
+ */ + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + +/* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * + * The first trial just tries to write the page assuming that some other thread + * has reset the count for threads inside the enlave by using ETRACK, and + * previous thread count has been zeroed out. The second trial calls ETRACK + * before EWB. If that fails we kick all the HW threads out, and then do EWB, + * which should be guaranteed the succeed. + */ +static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_va_page *va_page; + unsigned int va_offset; + void *va_slot; + int ret; + + encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; + + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + va_offset = sgx_alloc_va_slot(va_page); + va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; + if (sgx_va_page_full(va_page)) + list_move_tail(&va_page->list, &encl->va_pages); + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ETRACK"); + } + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + /* + * Slow path, send IPIs to kick cpus out of the + * enclave. Note, it's imperative that the cpu + * mask is generated *after* ETRACK, else we'll + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. + */ + on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } + } + + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EWB"); + + sgx_free_va_slot(va_page, va_offset); + } else { + encl_page->desc |= va_offset; + encl_page->va_page = va_page; + } +} + +static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_backing secs_backing; + int ret; + + mutex_lock(&encl->lock); + + sgx_encl_ewb(epc_page, backing); + encl_page->epc_page = NULL; + encl->secs_child_cnt--; + + if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { + ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + &secs_backing); + if (ret) + goto out; + + sgx_encl_ewb(encl->secs.epc_page, &secs_backing); + + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + + sgx_encl_put_backing(&secs_backing, true); + } + +out: + mutex_unlock(&encl->lock); +} + +/* + * Take a fixed number of pages from the head of the active page pool and + * reclaim them to the enclave's private shmem files. Skip the pages, which have + * been accessed since the last scan. Move those pages to the tail of active + * page pool so that the pages get scanned in LRU like fashion. + * + * Batch process a chunk of pages (at the moment 16) in order to degrade amount + * of IPI's and ETRACK's potentially required. 
sgx_encl_ewb() does degrade a bit + * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI + * + EWB) but not sufficiently. Reclaiming one page at a time would also be + * problematic as it would increase the lock contention too much, which would + * halt forward progress. + */ +static void sgx_reclaim_pages(void) +{ + struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; + struct sgx_backing backing[SGX_NR_TO_SCAN]; + struct sgx_epc_section *section; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + pgoff_t page_index; + int cnt = 0; + int ret; + int i; + + spin_lock(&sgx_reclaimer_lock); + for (i = 0; i < SGX_NR_TO_SCAN; i++) { + if (list_empty(&sgx_active_page_list)) + break; + + epc_page = list_first_entry(&sgx_active_page_list, + struct sgx_epc_page, list); + list_del_init(&epc_page->list); + encl_page = epc_page->owner; + + if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) + chunk[cnt++] = epc_page; + else + /* The owner is freeing the page. No need to add the + * page back to the list of reclaimable pages. + */ + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + encl_page = epc_page->owner; + + if (!sgx_reclaimer_age(epc_page)) + goto skip; + + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); + if (ret) + goto skip; + + mutex_lock(&encl_page->encl->lock); + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; + mutex_unlock(&encl_page->encl->lock); + continue; + +skip: + spin_lock(&sgx_reclaimer_lock); + list_add_tail(&epc_page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + + chunk[i] = NULL; + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (epc_page) + sgx_reclaimer_block(epc_page); + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (!epc_page) + continue; + + encl_page = epc_page->owner; + sgx_reclaimer_write(epc_page, &backing[i]); + sgx_encl_put_backing(&backing[i], true); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + + section = &sgx_epc_sections[epc_page->section]; + spin_lock(§ion->lock); + list_add_tail(&epc_page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); + } +} + +static unsigned long sgx_nr_free_pages(void) +{ + unsigned long cnt = 0; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) + cnt += sgx_epc_sections[i].free_cnt; + + return cnt; +} + +static bool sgx_should_reclaim(unsigned long watermark) +{ + return sgx_nr_free_pages() < watermark && + !list_empty(&sgx_active_page_list); +} + +static int ksgxd(void *p) +{ + int i; + + set_freezable(); + + /* + * Sanitize pages in order to recover from kexec(). The 2nd pass is + * required for SECS pages, whose child pages blocked EREMOVE. + */ + for (i = 0; i < sgx_nr_epc_sections; i++) + sgx_sanitize_section(&sgx_epc_sections[i]); + + for (i = 0; i < sgx_nr_epc_sections; i++) { + sgx_sanitize_section(&sgx_epc_sections[i]); + + /* Should never happen. 
*/ + if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) + WARN(1, "EPC section %d has unsanitized pages.\n", i); + } + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_freezable(ksgxd_waitq, + kthread_should_stop() || + sgx_should_reclaim(SGX_NR_HIGH_PAGES)); + + if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) + sgx_reclaim_pages(); + + cond_resched(); + } + + return 0; +} + +static bool __init sgx_page_reclaimer_init(void) +{ + struct task_struct *tsk; + + tsk = kthread_run(ksgxd, NULL, "ksgxd"); + if (IS_ERR(tsk)) + return false; + + ksgxd_tsk = tsk; + + return true; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + + spin_lock(§ion->lock); + + if (list_empty(§ion->page_list)) { + spin_unlock(§ion->lock); + return NULL; + } + + page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + list_del_init(&page->list); + section->free_cnt--; + + spin_unlock(§ion->lock); + return page; +} + +/** + * __sgx_alloc_epc_page() - Allocate an EPC page + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *__sgx_alloc_epc_page(void) +{ + struct sgx_epc_section *section; + struct sgx_epc_page *page; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) { + section = &sgx_epc_sections[i]; + + page = __sgx_alloc_epc_page_from_section(section); + if (page) + return page; + } + + return ERR_PTR(-ENOMEM); +} + +/** + * sgx_mark_page_reclaimable() - Mark a page as reclaimable + * @page: EPC page + * + * Mark a page as reclaimable and add it to the active page list. Pages + * are automatically removed from the active list when freed. + */ +void sgx_mark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; + list_add_tail(&page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); +} + +/** + * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list + * @page: EPC page + * + * Clear the reclaimable flag and remove the page from the active page list. + * + * Return: + * 0 on success, + * -EBUSY if the page is in the process of being reclaimed + */ +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { + /* The page is being reclaimed. */ + if (list_empty(&page->list)) { + spin_unlock(&sgx_reclaimer_lock); + return -EBUSY; + } + + list_del(&page->list); + page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + return 0; +} + +/** + * sgx_alloc_epc_page() - Allocate an EPC page + * @owner: the owner of the EPC page + * @reclaim: reclaim pages if necessary + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). If + * @reclaim is set to true, directly reclaim pages when we are out of pages. No + * mm's can be locked when @reclaim is set to true. + * + * Finally, wake up ksgxd when the number of pages goes below the watermark + * before returning back to the caller. 
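+ *
+ * A typical caller (as in sgx_encl_add_page() in ioctl.c) does roughly:
+ *
+ *	epc_page = sgx_alloc_epc_page(encl_page, true);
+ *	if (IS_ERR(epc_page))
+ *		return PTR_ERR(epc_page);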
+ * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) +{ + struct sgx_epc_page *page; + + for ( ; ; ) { + page = __sgx_alloc_epc_page(); + if (!IS_ERR(page)) { + page->owner = owner; + break; + } + + if (list_empty(&sgx_active_page_list)) + return ERR_PTR(-ENOMEM); + + if (!reclaim) { + page = ERR_PTR(-EBUSY); + break; + } + + if (signal_pending(current)) { + page = ERR_PTR(-ERESTARTSYS); + break; + } + + sgx_reclaim_pages(); + cond_resched(); + } + + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + wake_up(&ksgxd_waitq); + + return page; +} + +/** + * sgx_free_epc_page() - Free an EPC page + * @page: an EPC page + * + * Call EREMOVE for an EPC page and insert it back to the list of free pages. + */ +void sgx_free_epc_page(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) + return; + + spin_lock(§ion->lock); + list_add_tail(&page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); +} + +static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, + unsigned long index, + struct sgx_epc_section *section) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long i; + + section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!section->virt_addr) + return false; + + section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + if (!section->pages) { + memunmap(section->virt_addr); + return false; + } + + section->phys_addr = phys_addr; + spin_lock_init(§ion->lock); + INIT_LIST_HEAD(§ion->page_list); + INIT_LIST_HEAD(§ion->init_laundry_list); + + for (i = 0; i < nr_pages; i++) { + section->pages[i].section = index; + section->pages[i].flags = 0; + section->pages[i].owner = NULL; + list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + } + + section->free_cnt = nr_pages; + return true; +} + +/** + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. 
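+ *
+ * For example, low = 0x80000000 and high = 0x00000001 yield the metric
+ * 0x180000000 (bit 31 taken from @low, bit 32 taken from @high).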
+ */ +static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +static bool __init sgx_page_cache_init(void) +{ + u32 eax, ebx, ecx, edx, type; + u64 pa, size; + int i; + + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { + cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) { + pr_err_once("Unknown EPC section type: %u\n", type); + break; + } + + pa = sgx_calc_section_metric(eax, ebx); + size = sgx_calc_section_metric(ecx, edx); + + pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); + + if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { + pr_err("No free memory for an EPC section\n"); + break; + } + + sgx_nr_epc_sections++; + } + + if (!sgx_nr_epc_sections) { + pr_err("There are zero EPC sections.\n"); + return false; + } + + return true; +} + +static void __init sgx_init(void) +{ + int ret; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_SGX)) + return; + + if (!sgx_page_cache_init()) + return; + + if (!sgx_page_reclaimer_init()) + goto err_page_cache; + + ret = sgx_drv_init(); + if (ret) + goto err_kthread; + + return; + +err_kthread: + kthread_stop(ksgxd_tsk); + +err_page_cache: + for (i = 0; i < sgx_nr_epc_sections; i++) { + vfree(sgx_epc_sections[i].pages); + memunmap(sgx_epc_sections[i].virt_addr); + } +} + +device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h new file mode 100644 index 000000000000..5fa42d143feb --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_SGX_H +#define _X86_SGX_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include "arch.h" + +#undef pr_fmt +#define pr_fmt(fmt) "sgx: " fmt + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_EEXTEND_BLOCK_SIZE 256 +#define SGX_NR_TO_SCAN 16 +#define SGX_NR_LOW_PAGES 32 +#define SGX_NR_HIGH_PAGES 64 + +/* Pages, which are being tracked by the page reclaimer. */ +#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) + +struct sgx_epc_page { + unsigned int section; + unsigned int flags; + struct sgx_encl_page *owner; + struct list_head list; +}; + +/* + * The firmware can define multiple chunks of EPC to the different areas of the + * physical memory e.g. for memory areas of the each node. This structure is + * used to store EPC pages for one EPC section and virtual memory area where + * the pages have been mapped. + * + * 'lock' must be held before accessing 'page_list' or 'free_cnt'. + */ +struct sgx_epc_section { + unsigned long phys_addr; + void *virt_addr; + struct sgx_epc_page *pages; + + spinlock_t lock; + struct list_head page_list; + unsigned long free_cnt; + + /* + * Pages which need EREMOVE run on them before they can be + * used. Only safe to be accessed in ksgxd and init code. + * Not protected by locks. 
+ */ + struct list_head init_laundry_list; +}; + +extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; + +static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->phys_addr + index * PAGE_SIZE; +} + +static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->virt_addr + index * PAGE_SIZE; +} + +struct sgx_epc_page *__sgx_alloc_epc_page(void); +void sgx_free_epc_page(struct sgx_epc_page *page); + +void sgx_mark_page_reclaimable(struct sgx_epc_page *page); +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + +#endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index d3a0791bc052..1068002c8532 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + bool die_level_present = false; int leaf; leaf = detect_extended_topology_leaf(c); @@ -126,6 +127,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_present = true; die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } @@ -139,8 +141,12 @@ int detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, - core_plus_mask_width) & die_select_mask; + + if (die_level_present) { + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + } + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, die_plus_mask_width); /* diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 924571fe5864..c6ede3b3d302 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -501,12 +501,12 @@ static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) ghcb_rbp_is_valid(ghcb))) return false; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - regs->si = ghcb->save.rsi; - regs->di = ghcb->save.rdi; - regs->bp = ghcb->save.rbp; + regs->bx = ghcb_get_rbx(ghcb); + regs->cx = ghcb_get_rcx(ghcb); + regs->dx = ghcb_get_rdx(ghcb); + regs->si = ghcb_get_rsi(ghcb); + regs->di = ghcb_get_rdi(ghcb); + regs->bp = ghcb_get_rbp(ghcb); return true; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3492aa36bf09..6f7b8cc1bc9f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, init_completion(&cmd.done); for (; count; count -= 16) { - call_single_data_t csd = { - .func = cpuid_smp_cpuid, - .info = &cmd, - }; + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); cmd.regs.eax = pos; cmd.regs.ecx = pos >> 32; diff 
--git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 33ee47670b99..5fcac46aaf6b 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,8 +13,6 @@ #include <linux/uaccess.h> -static void *kdump_buf_page; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE @@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. + * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) { void *vaddr; @@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn); + vaddr = kmap_local_pfn(pfn); if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); + memcpy(buf, vaddr + offset, csize); } else { - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Kdump buffer page not" - " allocated\n"); - kunmap_atomic(vaddr); - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; + if (copy_to_user(buf, vaddr + offset, csize)) + csize = -EFAULT; } - return csize; -} + kunmap_local(vaddr); -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" - " page\n"); - ret = -ENOMEM; - } - - return ret; + return csize; } -arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index ddffd80f5c52..6a4cb71c2498 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -184,31 +184,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - .polarity = 1, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); 
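[Editor's note] The crash_dump_32.c hunk above can drop the pre-allocated bounce page because kmap_local_pfn() mappings are per-thread and remain valid if copy_to_user() sleeps or faults, unlike kmap_atomic(). A minimal sketch of the resulting pattern (condensed, illustrative function name):

	/* Map the old-kernel page locally, copy (may fault/sleep), unmap. */
	static ssize_t copy_one_oldmem_page(unsigned long pfn, char __user *ubuf,
					    size_t csize, unsigned long offset)
	{
		void *vaddr = kmap_local_pfn(pfn);	/* preemptible, per-thread mapping */
		ssize_t ret = csize;

		if (copy_to_user(ubuf, vaddr + offset, csize))
			ret = -EFAULT;

		kunmap_local(vaddr);
		return ret;
	}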
tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); tmp.ioapic.pin = fwspec->param[0]; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 97aa900386cb..299c20f0a38b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -183,7 +183,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } -void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { struct unwind_state state; diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..0d54099c2a3a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,16 +140,27 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 05e117137b45..5e9beb77cafd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -37,7 +37,6 @@ #include <asm/kasan.h> #include <asm/fixmap.h> #include <asm/realmode.h> -#include <asm/desc.h> #include <asm/extable.h> #include <asm/trapnr.h> #include <asm/sev-es.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3c417734790f..04bddaaba8e2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -26,15 +26,6 @@ #include <asm/nospec-branch.h> #include <asm/fixmap.h> -#ifdef CONFIG_PARAVIRT_XXL -#include <asm/asm-offsets.h> -#include <asm/paravirt.h> -#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg -#else -#define INTERRUPT_RETURN iretq -#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg -#endif - /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. @@ -540,21 +531,19 @@ SYM_DATA_END(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* - * 512 MB kernel mapping. We spend a full page on this pagetable - * anyway. + * Kernel high mapping. * - * The kernel code+data+bss must not be bigger than that. + * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in + * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled, + * 512 MiB otherwise. * - * (NOTE: at +512MB starts the module area, see MODULES_VADDR. - * If you want to increase this then increase MODULES_VADDR - * too.) + * (NOTE: after that starts the module area, see MODULES_VADDR.) * - * This table is eventually used by the kernel during normal - * runtime. Care must be taken to clear out undesired bits - * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. + * This table is eventually used by the kernel during normal runtime. + * Care must be taken to clear out undesired bits later, like _PAGE_RW + * or _PAGE_GLOBAL in some cases. 
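[Editor's note] As the rewritten comment above explains, the PMDS() line that follows maps KERNEL_IMAGE_SIZE with 2 MiB pages: 1 GiB with RANDOMIZE_BASE, 512 MiB otherwise, i.e. 512 or 256 PMD entries, so a single page-sized table still suffices. A standalone check of that arithmetic (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long pmd_size = 2UL << 20;	/* 2 MiB large pages */
		unsigned long sizes[] = { 512UL << 20, 1UL << 30 };

		for (int i = 0; i < 2; i++)
			printf("KERNEL_IMAGE_SIZE %4lu MiB -> %lu PMD entries (table holds 512)\n",
			       sizes[i] >> 20, sizes[i] / pmd_size);
		return 0;
	}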
*/ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, - KERNEL_IMAGE_SIZE/PMD_SIZE) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7a50f0b62a70..08651a4e6aa0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include <linux/cpu.h> #include <linux/irq.h> +#include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> @@ -50,7 +51,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +489,122 @@ void hpet_msi_mask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + 
hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c deleted file mode 100644 index 7dfb1e808928..000000000000 --- a/arch/x86/kernel/ima_arch.c +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright (C) 2018 IBM Corporation - */ -#include <linux/efi.h> -#include <linux/module.h> -#include <linux/ima.h> - -extern struct boot_params boot_params; - -static enum efi_secureboot_mode get_sb_mode(void) -{ - efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; - efi_status_t status; - unsigned long size; - u8 secboot, setupmode; - - size = sizeof(secboot); - - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { - pr_info("ima: secureboot mode unknown, no efi\n"); - return efi_secureboot_mode_unknown; - } - - /* Get variable contents into buffer */ - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - if (status != EFI_SUCCESS) { - pr_info("ima: secureboot mode unknown\n"); - return efi_secureboot_mode_unknown; - } - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ - setupmode = 0; - - if (secboot == 0 || setupmode == 1) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - pr_info("ima: secureboot mode enabled\n"); - return efi_secureboot_mode_enabled; -} - -bool arch_ima_get_secureboot(void) -{ - static enum efi_secureboot_mode sb_mode; - static bool initialized; - - if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = boot_params.secure_boot; - - if (sb_mode == efi_secureboot_mode_unset) - sb_mode = get_sb_mode(); - initialized = true; - } - - if (sb_mode == efi_secureboot_mode_enabled) - return true; - else - return false; -} - -/* secureboot arch rules */ -static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_SIG) - "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_SIG */ - "measure func=KEXEC_KERNEL_CHECK", -#if !IS_ENABLED(CONFIG_MODULE_SIG) - "appraise func=MODULE_CHECK appraise_type=imasig", -#endif - "measure func=MODULE_CHECK", - NULL -}; - -const char * const *arch_get_ima_policy(void) -{ - if 
(IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { - if (IS_ENABLED(CONFIG_MODULE_SIG)) - set_module_sig_enforced(); - return sb_arch_rules; - } - return NULL; -} diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 547c7abb39f5..a65e9e97857f 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -864,6 +864,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, p->ainsn.boostable = true; goto no_change; } + break; default: break; } @@ -937,6 +938,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * So clear it by resetting the current kprobe: */ regs->flags &= ~X86_EFLAGS_TF; + /* + * Since the single step (trap) has been cancelled, + * we need to restore BTF here. + */ + restore_btf(); /* * If the TF flag was set before the kprobe hit, diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 681a4b36e9bb..373e5fa3ce1f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -14,15 +14,21 @@ /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; + int bit; - /* Preempt is disabled by ftrace */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7f57ede3cb8e..5e78e01ca3b4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -740,6 +740,11 @@ static void __init kvm_apic_init(void) #endif } +static bool __init kvm_msi_ext_dest_id(void) +{ + return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); +} + static void __init kvm_init_platform(void) { kvmclock_init(); @@ -769,6 +774,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.msi_ext_dest_id = kvm_msi_ext_dest_id, .init.init_platform = kvm_init_platform, #if defined(CONFIG_AMD_MEM_ENCRYPT) .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 34b18f6eeb2c..aa593743acf6 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -44,7 +44,6 @@ static int __init parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ -#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) #define HVC_BOOT_ARRAY_SIZE \ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c0d409810658..8a67d1fa8dc5 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -99,11 +99,9 @@ static int filter_write(u32 reg) if (!__ratelimit(&fw_rs)) return 0; - if (reg == 
MSR_IA32_ENERGY_PERF_BIAS) - return 0; - - pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to [email protected].\n", - reg, current->comm, current->pid); + pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n", + reg, current->comm, current->pid); + pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n"); return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index f9e5352b3bef..624703af80a1 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -122,7 +122,7 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) return PERF_SAMPLE_REGS_ABI_32; else return PERF_SAMPLE_REGS_ABI_64; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index df342bedea88..ad582f9ac5a6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -511,11 +511,10 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT -void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, - test_thread_flag(TIF_X32) - ? __USER_CS : __USER32_CS, + x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif @@ -641,16 +640,12 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; - - /* Ensure the corresponding mm is not marked. */ if (current->mm) - current->mm->context.ia32_compat = 0; + current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL; /* TBD: overwrites user setup. Should have two bits. 
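[Editor's note] The exc_nmi hunk above (and the #DF/#DB/int3 hunks in traps.c later in this diff) switch from a plain bool to an irqentry_state_t carried between the NMI enter and exit helpers. A minimal sketch of the converted shape; the function name is illustrative, the real bodies are DEFINE_IDTENTRY_RAW handlers:

	static noinstr void example_nmi_like_body(struct pt_regs *regs)
	{
		irqentry_state_t irq_state = irqentry_nmi_enter(regs);

		instrumentation_begin();
		/* ... actual handling, e.g. default_do_nmi(regs) ... */
		instrumentation_end();

		irqentry_nmi_exit(regs, irq_state);
	}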
But 64bit processes have always behaved this way, @@ -662,10 +657,9 @@ void set_personality_64bit(void) static void __set_personality_x32(void) { #ifdef CONFIG_X86_X32 - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); if (current->mm) - current->mm->context.ia32_compat = TIF_X32; + current->mm->context.flags = 0; + current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit @@ -683,10 +677,14 @@ static void __set_personality_x32(void) static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; + if (current->mm) { + /* + * uprobes applied to this MM need to know this and + * cannot use user_64bit_mode() at that time. + */ + current->mm->context.flags = MM_CONTEXT_UPROBE_IA32; + } + current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84f581c91db4..740f3bdb3f61 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -119,11 +119,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; -/* For MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; - struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -1054,6 +1049,12 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); efi_fake_memmap(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index be0d7d4152ec..ea794a083c44 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. 
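[Editor's note] The signal.c hunk above renames the hook and adds a has_signal parameter so the generic exit-to-user loop can invoke it for TIF_NOTIFY_SIGNAL work as well, restarting an interrupted syscall even when no real signal is pending. The caller lives in the generic entry code outside this diff, so the sketch below is only an assumed illustration of its shape:

	static void example_handle_signal_work(struct pt_regs *regs,
					       unsigned long ti_work)
	{
		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
	}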
*/ handle_signal(&ksig, regs); return; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..a5330ff498f0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 1); + BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in size: */ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); @@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index de776b2e6046..8ca66af96a54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -82,6 +82,10 @@ #include <asm/hw_irq.h> #include <asm/stackprotector.h> +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> +#endif + /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); @@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } -static void init_freq_invariance(bool secondary); +static void init_freq_invariance(bool secondary, bool cppc_ready); /* * Report back to the Boot Processor during boot time or to the caller processor @@ -186,7 +190,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true); + init_freq_invariance(true, false); /* * Get our bogomips. @@ -229,6 +233,7 @@ static void notrace start_secondary(void *unused) #endif cpu_init_exception_handling(); cpu_init(); + rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -747,13 +752,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -980,10 +986,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? 
cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } @@ -1340,7 +1343,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - init_freq_invariance(false); + init_freq_invariance(false, false); smp_sanity_check(); switch (apic_intr_mode) { @@ -2027,6 +2030,48 @@ out: return true; } +#ifdef CONFIG_ACPI_CPPC_LIB +static bool amd_set_max_freq_ratio(void) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + u64 perf_ratio; + int rc; + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { + pr_debug("Could not retrieve perf counters (%d)\n", rc); + return false; + } + + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return false; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); + return false; + } + + arch_turbo_freq_ratio = perf_ratio; + arch_set_max_freq_ratio(false); + + return true; +} +#else +static bool amd_set_max_freq_ratio(void) +{ + return false; +} +#endif + static void init_counter_refs(void) { u64 aperf, mperf; @@ -2038,7 +2083,7 @@ static void init_counter_refs(void) this_cpu_write(arch_prev_mperf, mperf); } -static void init_freq_invariance(bool secondary) +static void init_freq_invariance(bool secondary, bool cppc_ready) { bool ret = false; @@ -2054,15 +2099,38 @@ static void init_freq_invariance(bool secondary) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ret = intel_set_max_freq_ratio(); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (!cppc_ready) { + return; + } + ret = amd_set_max_freq_ratio(); + } if (ret) { init_counter_refs(); static_branch_enable(&arch_scale_freq_key); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } else { pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); } } +#ifdef CONFIG_ACPI_CPPC_LIB +static DEFINE_MUTEX(freq_invariance_lock); + +void init_freq_invariance_cppc(void) +{ + static bool secondary; + + mutex_lock(&freq_invariance_lock); + + init_freq_invariance(secondary, true); + secondary = true; + + mutex_unlock(&freq_invariance_lock); +} +#endif + static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); @@ -2112,7 +2180,7 @@ error: schedule_work(&disable_freq_invariance_work); } #else -static inline void init_freq_invariance(bool secondary) +static inline void init_freq_invariance(bool secondary, bool cppc_ready) { } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ 
b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to <[email protected]> */ +#include <linux/interrupt.h> #include <linux/nodemask.h> #include <linux/export.h> #include <linux/mmzone.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..7f5aec758f0e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/vdso.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -117,6 +118,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); + } else { + if (fixup_vdso_exception(regs, trapnr, error_code, 0)) + return 0; } /* @@ -299,11 +303,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) local_irq_enable(); if (handle_user_split_lock(regs, error_code)) - return; + goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); +out: local_irq_disable(); } @@ -405,7 +410,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -550,6 +555,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; + if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) + return; + show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); goto exit; @@ -651,12 +659,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -851,7 +860,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -908,7 +917,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -926,7 +935,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. 
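[Editor's note] The do_trap_no_signal and #GP hunks above (and the math_error hunk below) insert the same check: before a user-mode exception is turned into a signal, the vDSO exception fixup gets a chance to handle it, which the SGX vDSO entry path relies on. A hedged sketch of the call shape; the wrapper name is illustrative and the last argument is the fault address (0 when not applicable):

	static bool example_try_vdso_fixup(struct pt_regs *regs, int trapnr,
					   unsigned long error_code)
	{
		if (!user_mode(regs))
			return false;

		/* On success the user IP is redirected to the vDSO fixup and
		 * no signal is delivered. */
		return fixup_vdso_exception(regs, trapnr, error_code, 0);
	}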
* Since we're not on the IST stack right now, everything will be @@ -1048,6 +1057,9 @@ static void math_error(struct pt_regs *regs, int trapnr) if (!si_code) goto exit; + if (fixup_vdso_exception(regs, trapnr, 0, 0)) + return; + force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 138bdb1fd136..a2b413394917 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1017,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; + break; + default: break; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf9e0adb5b7e..efd9e9ea17f2 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -454,13 +454,13 @@ SECTIONS ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") } -#ifdef CONFIG_X86_32 /* * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); -#else + +#ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. @@ -470,18 +470,12 @@ INIT_PER_CPU(gdt_page); INIT_PER_CPU(fixed_percpu_data); INIT_PER_CPU(irq_stack_backing_store); -/* - * Build-time check on the image size: - */ -. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); - #ifdef CONFIG_SMP . = ASSERT((fixed_percpu_data == 0), "fixed_percpu_data is not at start of per-cpu area"); #endif -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a3038d8deb6a..8b395821cb8d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -110,6 +110,7 @@ struct x86_init_ops x86_init __initdata = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, + .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f92dfd8ef10d..7ac592664c52 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -100,7 +100,8 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) help - Provides support for launching Encrypted VMs on AMD processors. + Provides support for launching Encrypted VMs (SEV) and Encrypted VMs + with Encrypted State (SEV-ES) on AMD processors. 
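[Editor's note] The uprobes hunk above, like the kprobes resume_execution hunk earlier in this diff, adds a break before the default label. The fall-through into an empty default was harmless, but compilers can flag it under -Wimplicit-fallthrough, hence the explicit break. Minimal illustration of the fixed shape (wrapper name is a placeholder):

	static int example_notifier_case(int val, struct pt_regs *regs, int ret)
	{
		switch (val) {
		case DIE_DEBUG:
			if (uprobe_post_sstep_notifier(regs))
				ret = NOTIFY_STOP;
			break;	/* added: previously fell through into 'default' */
		default:
			break;
		}
		return ret;
	}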
config KVM_MMU_AUDIT bool "Audit KVM MMU" diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d4..4bd14ab01323 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -10,7 +10,8 @@ endif KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ + $(KVM)/dirty_ring.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 83637a2ff605..13036cf0b912 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -146,6 +146,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } } +EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -418,7 +419,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) + F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f7a6e8f83783..dc921d76e42e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -264,6 +264,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); +} + static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5c7c4060b45c..922c69dcca4d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1951,8 +1951,8 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } -int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { @@ -2037,7 +2037,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, * Direct Synthetic timers only make sense with in-kernel * LAPIC */ - if (lapic_in_kernel(vcpu)) + if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e68c6c2e9649..6d7def2b0aad 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -126,7 +126,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); -int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_get_hv_cpuid(struct 
kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4aa1c2e00e2a..8a4de3f12820 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -16,8 +16,6 @@ #include <trace/events/kvm.h> -#include <asm/msidef.h> - #include "irq.h" #include "ioapic.h" @@ -104,22 +102,19 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? - (u64)e->msi.address_hi << 32 : 0), - e->msi.data); - - irq->dest_id = (e->msi.address_lo & - MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; - if (kvm->arch.x2apic_format) - irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); - irq->vector = (e->msi.data & - MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = kvm_lapic_irq_dest_mode( - !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); - irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; - irq->delivery_mode = e->msi.data & 0x700; - irq->msi_redir_hint = ((e->msi.address_lo - & MSI_ADDR_REDIRECTION_LOWPRI) > 0); + struct msi_msg msg = { .address_lo = e->msi.address_lo, + .address_hi = e->msi.address_hi, + .data = e->msi.data }; + + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? + (u64)msg.address_hi << 32 : 0), msg.data); + + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); + irq->vector = msg.arch_data.vector; + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); + irq->trig_mode = msg.arch_data.is_level; + irq->delivery_mode = msg.arch_data.delivery_mode << 8; + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; irq->level = 1; irq->shorthand = APIC_DEST_NOSHORT; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index a889563ad02d..f15bc16de07c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,6 +9,31 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) +static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ @@ -18,6 +43,7 @@ static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \ unsigned long val) \ { \ vcpu->arch.regs[VCPU_REGS_##uname] = val; \ + kvm_register_mark_dirty(vcpu, VCPU_REGS_##uname); \ } BUILD_KVM_GPR_ACCESSORS(rax, RAX) BUILD_KVM_GPR_ACCESSORS(rbx, RBX) @@ -37,31 +63,6 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - return test_bit(reg, (unsigned long 
*)&vcpu->arch.regs_avail); -} - -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); -} - -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 86c33d53c90a..3136e05831cf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2843,14 +2843,35 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; + int r; unsigned long pe; - if (!lapic_in_kernel(vcpu) || !apic->pending_events) + if (!lapic_in_kernel(vcpu)) return; /* + * Read pending events before calling the check_events + * callback. + */ + pe = smp_load_acquire(&apic->pending_events); + if (!pe) + return; + + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); + if (r < 0) + return; + /* + * If an event has happened and caused a vmexit, + * we know INITs are latched and therefore + * we will not incorrectly deliver an APIC + * event instead of a vmexit. + */ + } + + /* * INITs are latched while CPU is in specific states - * (SMM, VMX non-root mode, SVM with GIF=0). + * (SMM, VMX root mode, SVM with GIF=0). * Because a CPU cannot be in these states immediately * after it has processed an INIT signal (and thus in * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs @@ -2858,26 +2879,28 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) */ if (kvm_vcpu_latch_init(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); - if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + if (test_bit(KVM_APIC_SIPI, &pe)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); return; } - pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { + clear_bit(KVM_APIC_INIT, &apic->pending_events); kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; } - if (test_bit(KVM_APIC_SIPI, &pe) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { - /* evaluate pending_events before reading the vector */ - smp_rmb(); - sipi_vector = apic->sipi_vector; - kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (test_bit(KVM_APIC_SIPI, &pe)) { + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + /* evaluate pending_events before reading the vector */ + smp_rmb(); + sipi_vector = apic->sipi_vector; + kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } } } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7a6ae9e90bd7..c478904af518 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; - if (no_dirty_log && slot->dirty_bitmap) + if (no_dirty_log && 
kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; @@ -1289,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +int kvm_cpu_dirty_log_size(void) +{ + if (kvm_x86_ops.cpu_dirty_log_size) + return kvm_x86_ops.cpu_dirty_log_size(); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 213699b27b44..e798489b56b5 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -381,6 +381,35 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_tdp_mmu_spte_changed, + TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte), + TP_ARGS(as_id, gfn, level, old_spte, new_spte), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, old_spte) + __field(u64, new_spte) + /* Level cannot be larger than 5 on x86, so it fits in a u8. */ + __field(u8, level) + /* as_id can only be 0 or 1 x86, so it fits in a u8. */ + __field(u8, as_id) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->old_spte = old_spte; + __entry->new_spte = new_spte; + __entry->level = level; + __entry->as_id = as_id; + ), + + TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx", + __entry->as_id, __entry->gfn, __entry->level, + __entry->old_spte, __entry->new_spte + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 84c8f06bec26..4bd2f1dc0172 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,6 +7,8 @@ #include "tdp_mmu.h" #include "spte.h" +#include <trace/events/kvm.h> + #ifdef CONFIG_X86_64 static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); @@ -108,6 +110,8 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, sp->gfn = gfn; sp->tdp_mmu_page = true; + trace_kvm_mmu_get_page(sp, true); + return sp; } @@ -185,7 +189,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, if ((!is_writable_pte(old_spte) || pfn_changed) && is_writable_pte(new_spte)) { slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(slot, gfn); + mark_page_dirty_in_slot(kvm, slot, gfn); } } @@ -244,6 +248,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (old_spte == new_spte) return; + trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); + /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -278,6 +284,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, pt = spte_to_child_pt(old_spte, level); sp = sptep_to_sp(pt); + trace_kvm_mmu_prepare_zap_page(sp); + list_del(&sp->link); if (sp->lpage_disallowed) @@ -480,11 +488,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_noslot_pfn(pfn))) { new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); - } else + } else { make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + } if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; @@ -698,6 +708,8 @@ static int age_gfn_range(struct kvm 
*kvm, struct kvm_memory_slot *slot, tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; + + trace_kvm_age_page(iter.gfn, iter.level, slot, young); } return young; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 7f0059aa30e1..f472fdb6ae7e 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -84,12 +84,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else /* MTRR mask */ mask |= 0x7ff; - if (data & mask) { - kvm_inject_gp(vcpu, 0); - return false; - } - return true; + return (data & mask) == 0; } EXPORT_SYMBOL_GPL(kvm_mtrr_valid); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8c550999ace0..0ef84d57b72e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -233,7 +233,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, */ static int avic_update_access_page(struct kvm *kvm, bool activate) { - int ret = 0; + void __user *ret; + int r = 0; mutex_lock(&kvm->slots_lock); /* @@ -249,13 +250,15 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, activate ? PAGE_SIZE : 0); - if (ret) + if (IS_ERR(ret)) { + r = PTR_ERR(ret); goto out; + } kvm->arch.apic_access_page_done = activate; out: mutex_unlock(&kvm->slots_lock); - return ret; + return r; } static int avic_init_backing_page(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9e4c226dbf7d..b0b667456b2e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -254,7 +254,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; return nested_vmcb_check_controls(&vmcb12->control); @@ -381,7 +381,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.ds = vmcb12->save.ds; svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; - kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(&svm->vcpu, vmcb12->save.efer); svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); @@ -394,8 +394,8 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rax = vmcb12->save.rax; svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; - svm->vmcb->save.dr7 = vmcb12->save.dr7; - svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM; svm->vmcb->save.cpl = vmcb12->save.cpl; } @@ -660,13 +660,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; kvm_set_rflags(&svm->vcpu, hsave->save.rflags); + kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); kvm_rax_write(&svm->vcpu, hsave->save.rax); kvm_rsp_write(&svm->vcpu, hsave->save.rsp); kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = 0; + svm->vmcb->save.dr7 = DR7_FIXED_1; svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; diff --git a/arch/x86/kvm/svm/sev.c 
b/arch/x86/kvm/svm/sev.c index 566f4d18185b..9858d5ae9ddd 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -14,10 +14,20 @@ #include <linux/psp-sev.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/processor.h> +#include <linux/trace_events.h> +#include <asm/fpu/internal.h> + +#include <asm/trapnr.h> #include "x86.h" #include "svm.h" +#include "cpuid.h" +#include "trace.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) +static u8 sev_enc_bit; static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); @@ -25,7 +35,6 @@ unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) struct enc_region { struct list_head list; @@ -57,19 +66,19 @@ static int sev_flush_asids(void) } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(void) +static bool __sev_recycle_asids(int min_asid, int max_asid) { int pos; /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, - max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) + pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid); + if (pos >= max_asid) return false; if (sev_flush_asids()) return false; + /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, max_sev_asid); bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); @@ -77,20 +86,23 @@ static bool __sev_recycle_asids(void) return true; } -static int sev_asid_new(void) +static int sev_asid_new(struct kvm_sev_info *sev) { + int pos, min_asid, max_asid; bool retry = true; - int pos; mutex_lock(&sev_bitmap_lock); /* - * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. */ + min_asid = sev->es_active ? 0 : min_sev_asid - 1; + max_asid = sev->es_active ? 
min_sev_asid - 1 : max_sev_asid; again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) { - if (retry && __sev_recycle_asids()) { + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); + if (pos >= max_asid) { + if (retry && __sev_recycle_asids(min_asid, max_asid)) { retry = false; goto again; } @@ -172,7 +184,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - asid = sev_asid_new(); + asid = sev_asid_new(sev); if (asid < 0) return ret; @@ -191,6 +203,16 @@ e_free: return ret; } +static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + if (!sev_es) + return -ENOTTY; + + to_kvm_svm(kvm)->sev_info.es_active = true; + + return sev_guest_init(kvm, argp); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { struct sev_data_activate *data; @@ -490,6 +512,96 @@ e_free: return ret; } +static int sev_es_sync_vmsa(struct vcpu_svm *svm) +{ + struct vmcb_save_area *save = &svm->vmcb->save; + + /* Check some debug related fields before encrypting the VMSA */ + if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + return -EINVAL; + + /* Sync registgers */ + save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; + save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; + save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; + save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; + save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; + save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; + save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; +#ifdef CONFIG_X86_64 + save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; + save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; + save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; + save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; + save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; + save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; + save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; + save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; +#endif + save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; + + /* Sync some non-GPR registers before encrypting */ + save->xcr0 = svm->vcpu.arch.xcr0; + save->pkru = svm->vcpu.arch.pkru; + save->xss = svm->vcpu.arch.ia32_xss; + + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(svm->vmsa, save, sizeof(*save)); + + return 0; +} + +static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_update_vmsa *vmsa; + int i, ret; + + if (!sev_es_guest(kvm)) + return -ENOTTY; + + vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL); + if (!vmsa) + return -ENOMEM; + + for (i = 0; i < kvm->created_vcpus; i++) { + struct vcpu_svm *svm = to_svm(kvm->vcpus[i]); + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); + if (ret) + goto e_free; + + /* + * The LAUNCH_UPDATE_VMSA command will perform in-place + * encryption of the VMSA memory content (i.e it will write + * the same memory region with the guest's key), so invalidate + * it first. 
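[Editor's note] sev_asid_new() above now partitions the ASID space exactly as its comment states: SEV-ES guests allocate from 1 to min_sev_asid - 1 and plain SEV guests from min_sev_asid to max_sev_asid (assuming the unchanged tail of the function still maps bitmap bit n to ASID n + 1). A standalone sketch of the bounds with hypothetical firmware limits:

	#include <stdio.h>

	int main(void)
	{
		unsigned int min_sev_asid = 64, max_sev_asid = 509;	/* hypothetical */

		for (int es_active = 0; es_active <= 1; es_active++) {
			/* Same bounds the hunk computes for find_next_zero_bit(). */
			unsigned int min_asid = es_active ? 0 : min_sev_asid - 1;
			unsigned int max_asid = es_active ? min_sev_asid - 1
							  : max_sev_asid;

			printf("%s: bitmap range [%u, %u) -> ASIDs %u..%u\n",
			       es_active ? "SEV-ES" : "SEV", min_asid, max_asid,
			       min_asid + 1, max_asid);
		}
		return 0;
	}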
+ */ + clflush_cache_range(svm->vmsa, PAGE_SIZE); + + vmsa->handle = sev->handle; + vmsa->address = __sme_pa(svm->vmsa); + vmsa->len = PAGE_SIZE; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa, + &argp->error); + if (ret) + goto e_free; + + svm->vcpu.arch.guest_state_protected = true; + } + +e_free: + kfree(vmsa); + return ret; +} + static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = (void __user *)(uintptr_t)argp->data; @@ -932,7 +1044,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) struct kvm_sev_cmd sev_cmd; int r; - if (!svm_sev_enabled()) + if (!svm_sev_enabled() || !sev) return -ENOTTY; if (!argp) @@ -947,12 +1059,18 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_ES_INIT: + r = sev_es_guest_init(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; case KVM_SEV_LAUNCH_UPDATE_DATA: r = sev_launch_update_data(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_UPDATE_VMSA: + r = sev_launch_update_vmsa(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_MEASURE: r = sev_launch_measure(kvm, &sev_cmd); break; @@ -1125,49 +1243,61 @@ void sev_vm_destroy(struct kvm *kvm) sev_asid_free(sev->asid); } -int __init sev_hardware_setup(void) +void __init sev_hardware_setup(void) { - struct sev_user_data_status *status; - int rc; + unsigned int eax, ebx, ecx, edx; + bool sev_es_supported = false; + bool sev_supported = false; + + /* Does the CPU support SEV? */ + if (!boot_cpu_has(X86_FEATURE_SEV)) + goto out; + + /* Retrieve SEV CPUID information */ + cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); + + /* Set encryption bit location for SEV-ES guests */ + sev_enc_bit = ebx & 0x3f; /* Maximum number of encrypted guests supported simultaneously */ - max_sev_asid = cpuid_ecx(0x8000001F); + max_sev_asid = ecx; if (!svm_sev_enabled()) - return 1; + goto out; /* Minimum ASID value that should be used for SEV guest */ - min_sev_asid = cpuid_edx(0x8000001F); + min_sev_asid = edx; /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) - return 1; + goto out; sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_reclaim_asid_bitmap) - return 1; + goto out; - status = kmalloc(sizeof(*status), GFP_KERNEL); - if (!status) - return 1; + pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1); + sev_supported = true; - /* - * Check SEV platform status. - * - * PLATFORM_STATUS can be called in any state, if we failed to query - * the PLATFORM status then either PSP firmware does not support SEV - * feature or SEV firmware is dead. - */ - rc = sev_platform_status(status, NULL); - if (rc) - goto err; + /* SEV-ES support requested? */ + if (!sev_es) + goto out; - pr_info("SEV supported\n"); + /* Does the CPU support SEV-ES? */ + if (!boot_cpu_has(X86_FEATURE_SEV_ES)) + goto out; -err: - kfree(status); - return rc; + /* Has the system been allocated ASIDs for SEV-ES? */ + if (min_sev_asid == 1) + goto out; + + pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1); + sev_es_supported = true; + +out: + sev = sev_supported; + sev_es = sev_es_supported; } void sev_hardware_teardown(void) @@ -1181,13 +1311,329 @@ void sev_hardware_teardown(void) sev_flush_asids(); } +/* + * Pages used by hardware to hold guest encrypted state must be flushed before + * returning them to the system. 
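
For reference, the rewritten sev_hardware_setup() above derives everything it needs from CPUID leaf 0x8000001F: EBX[5:0] gives the C-bit position reported to SEV-ES guests, ECX the highest usable SEV ASID, and EDX the lowest ASID usable by a plain SEV guest (everything below it is reserved for SEV-ES). A small user-space sketch of that decoding using the compiler's cpuid helper (illustrative only, not kernel code):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x8000001F not available");
		return 1;
	}

	printf("C-bit position : %u\n", ebx & 0x3f);
	printf("max SEV ASID   : %u\n", ecx);
	printf("min SEV ASID   : %u\n", edx);
	printf("SEV-ES ASIDs   : %u\n", edx ? edx - 1 : 0);
	return 0;
}
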
+ */ +static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va, + unsigned long len) +{ + /* + * If hardware enforced cache coherency for encrypted mappings of the + * same physical page is supported, nothing to do. + */ + if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) + return; + + /* + * If the VM Page Flush MSR is supported, use it to flush the page + * (using the page virtual address and the guest ASID). + */ + if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) { + struct kvm_sev_info *sev; + unsigned long va_start; + u64 start, stop; + + /* Align start and stop to page boundaries. */ + va_start = (unsigned long)va; + start = (u64)va_start & PAGE_MASK; + stop = PAGE_ALIGN((u64)va_start + len); + + if (start < stop) { + sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + + while (start < stop) { + wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, + start | sev->asid); + + start += PAGE_SIZE; + } + + return; + } + + WARN(1, "Address overflow, using WBINVD\n"); + } + + /* + * Hardware should always have one of the above features, + * but if not, use WBINVD and issue a warning. + */ + WARN_ONCE(1, "Using WBINVD to flush guest memory\n"); + wbinvd_on_all_cpus(); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm; + + if (!sev_es_guest(vcpu->kvm)) + return; + + svm = to_svm(vcpu); + + if (vcpu->arch.guest_state_protected) + sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->vmsa)); + + if (svm->ghcb_sa_free) + kfree(svm->ghcb_sa); +} + +static void dump_ghcb(struct vcpu_svm *svm) +{ + struct ghcb *ghcb = svm->ghcb; + unsigned int nbits; + + /* Re-use the dump_invalid_vmcb module parameter */ + if (!dump_invalid_vmcb) { + pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); + return; + } + + nbits = sizeof(ghcb->save.valid_bitmap) * 8; + + pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", + ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", + ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", + ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", + ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); +} + +static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct ghcb *ghcb = svm->ghcb; + + /* + * The GHCB protocol so far allows for the following data + * to be returned: + * GPRs RAX, RBX, RCX, RDX + * + * Copy their values to the GHCB if they are dirty. 
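
The flush helper above prefers the VM Page Flush MSR when the hardware is not cache-coherent for encrypted mappings: the virtual range is widened to page boundaries and one MSR write per page carries the page address with the guest ASID in the low bits, falling back to WBINVD only as a last resort. A reduced sketch of that loop (wrmsr_stub() and the example_* names are placeholders rather than kernel APIs; a real caller would pass MSR_AMD64_VM_PAGE_FLUSH as the MSR index):

#include <stdint.h>

#define EX_PAGE_SIZE 4096ULL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

static void wrmsr_stub(uint32_t msr, uint64_t value)
{
	/* placeholder for the privileged MSR write */
	(void)msr; (void)value;
}

static void example_vm_page_flush(uint32_t vm_page_flush_msr, void *va,
				  unsigned long len, unsigned int asid)
{
	/* widen [va, va + len) to whole pages, as the hunk above does */
	uint64_t addr = (uint64_t)(uintptr_t)va & EX_PAGE_MASK;
	uint64_t end  = ((uint64_t)(uintptr_t)va + len + EX_PAGE_SIZE - 1) &
			EX_PAGE_MASK;

	/* one MSR write per page: page address plus guest ASID in the low bits */
	for (; addr < end; addr += EX_PAGE_SIZE)
		wrmsr_stub(vm_page_flush_msr, addr | asid);
}
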
+ */ + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RAX)) + ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RBX)) + ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RCX)) + ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RDX)) + ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); +} + +static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct ghcb *ghcb = svm->ghcb; + u64 exit_code; + + /* + * The GHCB protocol so far allows for the following data + * to be supplied: + * GPRs RAX, RBX, RCX, RDX + * XCR0 + * CPL + * + * VMMCALL allows the guest to provide extra registers. KVM also + * expects RSI for hypercalls, so include that, too. + * + * Copy their values to the appropriate location if supplied. + */ + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + + vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb); + + svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb); + + if (ghcb_xcr0_is_valid(ghcb)) { + vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); + kvm_update_cpuid_runtime(vcpu); + } + + /* Copy the GHCB exit information into the VMCB fields */ + exit_code = ghcb_get_sw_exit_code(ghcb); + control->exit_code = lower_32_bits(exit_code); + control->exit_code_hi = upper_32_bits(exit_code); + control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); + control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static int sev_es_validate_vmgexit(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu; + struct ghcb *ghcb; + u64 exit_code = 0; + + ghcb = svm->ghcb; + + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) + goto vmgexit_err; + + /* + * Retrieve the exit code now even though is may not be marked valid + * as it could help with debugging. 
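
The sync-from-GHCB path above only consumes fields whose bit is set in the GHCB valid bitmap, which is what the ghcb_get_*_if_valid() accessors encapsulate. A reduced model of that guard (the struct layout and field numbering are invented for the example and do not match the GHCB specification's offsets):

#include <stdint.h>
#include <stdbool.h>

struct example_ghcb {
	uint64_t rax, rcx, rdx, rbx;
	uint8_t  valid_bitmap[16];	/* one bit per save-area field */
};

enum example_field { EX_FIELD_RAX, EX_FIELD_RCX, EX_FIELD_RDX, EX_FIELD_RBX };

static bool example_field_is_valid(const struct example_ghcb *g, unsigned int f)
{
	return g->valid_bitmap[f / 8] & (1u << (f % 8));
}

static uint64_t example_get_rax_if_valid(const struct example_ghcb *g)
{
	/* consume the field only if the guest marked it valid, else 0 */
	return example_field_is_valid(g, EX_FIELD_RAX) ? g->rax : 0;
}
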
+ */ + exit_code = ghcb_get_sw_exit_code(ghcb); + + if (!ghcb_sw_exit_code_is_valid(ghcb) || + !ghcb_sw_exit_info_1_is_valid(ghcb) || + !ghcb_sw_exit_info_2_is_valid(ghcb)) + goto vmgexit_err; + + switch (ghcb_get_sw_exit_code(ghcb)) { + case SVM_EXIT_READ_DR7: + break; + case SVM_EXIT_WRITE_DR7: + if (!ghcb_rax_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_RDTSC: + break; + case SVM_EXIT_RDPMC: + if (!ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_CPUID: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + if (ghcb_get_rax(ghcb) == 0xd) + if (!ghcb_xcr0_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_INVD: + break; + case SVM_EXIT_IOIO: + if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { + if (!ghcb_sw_scratch_is_valid(ghcb)) + goto vmgexit_err; + } else { + if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) + if (!ghcb_rax_is_valid(ghcb)) + goto vmgexit_err; + } + break; + case SVM_EXIT_MSR: + if (!ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + if (ghcb_get_sw_exit_info_1(ghcb)) { + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rdx_is_valid(ghcb)) + goto vmgexit_err; + } + break; + case SVM_EXIT_VMMCALL: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_cpl_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_RDTSCP: + break; + case SVM_EXIT_WBINVD: + break; + case SVM_EXIT_MONITOR: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb) || + !ghcb_rdx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_EXIT_MWAIT: + if (!ghcb_rax_is_valid(ghcb) || + !ghcb_rcx_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_MMIO_READ: + case SVM_VMGEXIT_MMIO_WRITE: + if (!ghcb_sw_scratch_is_valid(ghcb)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_NMI_COMPLETE: + case SVM_VMGEXIT_AP_JUMP_TABLE: + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + break; + default: + goto vmgexit_err; + } + + return 0; + +vmgexit_err: + vcpu = &svm->vcpu; + + if (ghcb->ghcb_usage) { + vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", + ghcb->ghcb_usage); + } else { + vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n", + exit_code); + dump_ghcb(svm); + } + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + + return -EINVAL; +} + +static void pre_sev_es_run(struct vcpu_svm *svm) +{ + if (!svm->ghcb) + return; + + if (svm->ghcb_sa_free) { + /* + * The scratch area lives outside the GHCB, so there is a + * buffer that, depending on the operation performed, may + * need to be synced, then freed. 
+ */ + if (svm->ghcb_sa_sync) { + kvm_write_guest(svm->vcpu.kvm, + ghcb_get_sw_scratch(svm->ghcb), + svm->ghcb_sa, svm->ghcb_sa_len); + svm->ghcb_sa_sync = false; + } + + kfree(svm->ghcb_sa); + svm->ghcb_sa = NULL; + svm->ghcb_sa_free = false; + } + + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + + sev_es_sync_to_ghcb(svm); + + kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); + svm->ghcb = NULL; +} + void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu(svm_data, cpu); int asid = sev_get_asid(svm->vcpu.kvm); + /* Perform any SEV-ES pre-run actions */ + pre_sev_es_run(svm); + /* Assign the asid allocated with this SEV guest */ - svm->vmcb->control.asid = asid; + svm->asid = asid; /* * Flush guest TLB: @@ -1203,3 +1649,394 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } + +#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) +static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct ghcb *ghcb = svm->ghcb; + u64 ghcb_scratch_beg, ghcb_scratch_end; + u64 scratch_gpa_beg, scratch_gpa_end; + void *scratch_va; + + scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); + if (!scratch_gpa_beg) { + pr_err("vmgexit: scratch gpa not provided\n"); + return false; + } + + scratch_gpa_end = scratch_gpa_beg + len; + if (scratch_gpa_end < scratch_gpa_beg) { + pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", + len, scratch_gpa_beg); + return false; + } + + if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { + /* Scratch area begins within GHCB */ + ghcb_scratch_beg = control->ghcb_gpa + + offsetof(struct ghcb, shared_buffer); + ghcb_scratch_end = control->ghcb_gpa + + offsetof(struct ghcb, reserved_1); + + /* + * If the scratch area begins within the GHCB, it must be + * completely contained in the GHCB shared buffer area. + */ + if (scratch_gpa_beg < ghcb_scratch_beg || + scratch_gpa_end > ghcb_scratch_end) { + pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", + scratch_gpa_beg, scratch_gpa_end); + return false; + } + + scratch_va = (void *)svm->ghcb; + scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + } else { + /* + * The guest memory must be read into a kernel buffer, so + * limit the size + */ + if (len > GHCB_SCRATCH_AREA_LIMIT) { + pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", + len, GHCB_SCRATCH_AREA_LIMIT); + return false; + } + scratch_va = kzalloc(len, GFP_KERNEL); + if (!scratch_va) + return false; + + if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { + /* Unable to copy scratch area from guest */ + pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); + + kfree(scratch_va); + return false; + } + + /* + * The scratch area is outside the GHCB. The operation will + * dictate whether the buffer needs to be synced before running + * the vCPU next time (i.e. a read was requested so the data + * must be written back to the guest memory). 
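
setup_vmgexit_scratch() above accepts a scratch area in one of two forms: inside the GHCB page, where it must fit entirely within the shared buffer, or outside it, where the contents are copied into a size-capped kernel buffer. A condensed sketch of the validation it performs (the offsets and the 16-page cap are passed in or written out literally here; the kernel derives them from offsetof() on struct ghcb and GHCB_SCRATCH_AREA_LIMIT, and ghcb_gpa is assumed page-aligned, as the mapped GHCB is):

#include <stdint.h>
#include <stdbool.h>

static bool example_scratch_ok(uint64_t ghcb_gpa, uint64_t shared_buf_off,
			       uint64_t shared_buf_end_off,
			       uint64_t scratch_gpa, uint64_t len)
{
	uint64_t scratch_end = scratch_gpa + len;

	if (!scratch_gpa)
		return false;		/* no scratch area provided */
	if (scratch_end < scratch_gpa)
		return false;		/* length overflowed the address */

	if ((scratch_gpa & ~0xfffULL) == ghcb_gpa) {
		/* begins inside the GHCB: must stay inside the shared buffer */
		return scratch_gpa >= ghcb_gpa + shared_buf_off &&
		       scratch_end <= ghcb_gpa + shared_buf_end_off;
	}

	/* outside the GHCB: the contents get copied, so cap the size instead */
	return len <= 16 * 4096ULL;
}
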
+ */ + svm->ghcb_sa_sync = sync; + svm->ghcb_sa_free = true; + } + + svm->ghcb_sa = scratch_va; + svm->ghcb_sa_len = len; + + return true; +} + +static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, + unsigned int pos) +{ + svm->vmcb->control.ghcb_gpa &= ~(mask << pos); + svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; +} + +static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) +{ + return (svm->vmcb->control.ghcb_gpa >> pos) & mask; +} + +static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) +{ + svm->vmcb->control.ghcb_gpa = value; +} + +static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 ghcb_info; + int ret = 1; + + ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK; + + trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id, + control->ghcb_gpa); + + switch (ghcb_info) { + case GHCB_MSR_SEV_INFO_REQ: + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + GHCB_VERSION_MIN, + sev_enc_bit)); + break; + case GHCB_MSR_CPUID_REQ: { + u64 cpuid_fn, cpuid_reg, cpuid_value; + + cpuid_fn = get_ghcb_msr_bits(svm, + GHCB_MSR_CPUID_FUNC_MASK, + GHCB_MSR_CPUID_FUNC_POS); + + /* Initialize the registers needed by the CPUID intercept */ + vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; + vcpu->arch.regs[VCPU_REGS_RCX] = 0; + + ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID); + if (!ret) { + ret = -EINVAL; + break; + } + + cpuid_reg = get_ghcb_msr_bits(svm, + GHCB_MSR_CPUID_REG_MASK, + GHCB_MSR_CPUID_REG_POS); + if (cpuid_reg == 0) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX]; + else if (cpuid_reg == 1) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX]; + else if (cpuid_reg == 2) + cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX]; + else + cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX]; + + set_ghcb_msr_bits(svm, cpuid_value, + GHCB_MSR_CPUID_VALUE_MASK, + GHCB_MSR_CPUID_VALUE_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_TERM_REQ: { + u64 reason_set, reason_code; + + reason_set = get_ghcb_msr_bits(svm, + GHCB_MSR_TERM_REASON_SET_MASK, + GHCB_MSR_TERM_REASON_SET_POS); + reason_code = get_ghcb_msr_bits(svm, + GHCB_MSR_TERM_REASON_MASK, + GHCB_MSR_TERM_REASON_POS); + pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", + reason_set, reason_code); + fallthrough; + } + default: + ret = -EINVAL; + } + + trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, + control->ghcb_gpa, ret); + + return ret; +} + +int sev_handle_vmgexit(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + u64 ghcb_gpa, exit_code; + struct ghcb *ghcb; + int ret; + + /* Validate the GHCB */ + ghcb_gpa = control->ghcb_gpa; + if (ghcb_gpa & GHCB_MSR_INFO_MASK) + return sev_handle_vmgexit_msr_protocol(svm); + + if (!ghcb_gpa) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n"); + return -EINVAL; + } + + if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + /* Unable to map GHCB from guest */ + vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", + ghcb_gpa); + return -EINVAL; + } + + svm->ghcb = svm->ghcb_map.hva; + ghcb = svm->ghcb_map.hva; + + trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb); + + exit_code = ghcb_get_sw_exit_code(ghcb); + + ret = sev_es_validate_vmgexit(svm); + if (ret) + return ret; + + sev_es_sync_from_ghcb(svm); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 
0); + + ret = -EINVAL; + switch (exit_code) { + case SVM_VMGEXIT_MMIO_READ: + if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) + break; + + ret = kvm_sev_es_mmio_read(&svm->vcpu, + control->exit_info_1, + control->exit_info_2, + svm->ghcb_sa); + break; + case SVM_VMGEXIT_MMIO_WRITE: + if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) + break; + + ret = kvm_sev_es_mmio_write(&svm->vcpu, + control->exit_info_1, + control->exit_info_2, + svm->ghcb_sa); + break; + case SVM_VMGEXIT_NMI_COMPLETE: + ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); + break; + case SVM_VMGEXIT_AP_JUMP_TABLE: { + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + + switch (control->exit_info_1) { + case 0: + /* Set AP jump table address */ + sev->ap_jump_table = control->exit_info_2; + break; + case 1: + /* Get AP jump table address */ + ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); + break; + default: + pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", + control->exit_info_1); + ghcb_set_sw_exit_info_1(ghcb, 1); + ghcb_set_sw_exit_info_2(ghcb, + X86_TRAP_UD | + SVM_EVTINJ_TYPE_EXEPT | + SVM_EVTINJ_VALID); + } + + ret = 1; + break; + } + case SVM_VMGEXIT_UNSUPPORTED_EVENT: + vcpu_unimpl(&svm->vcpu, + "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", + control->exit_info_1, control->exit_info_2); + break; + default: + ret = svm_invoke_exit_handler(svm, exit_code); + } + + return ret; +} + +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) +{ + if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + return -EINVAL; + + return kvm_sev_es_string_io(&svm->vcpu, size, port, + svm->ghcb_sa, svm->ghcb_sa_len, in); +} + +void sev_es_init_vmcb(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; + + /* + * An SEV-ES guest requires a VMSA area that is a separate from the + * VMCB page. Do not include the encryption mask on the VMSA physical + * address since hardware will access it using the guest key. 
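
The MSR-protocol handler above (sev_handle_vmgexit_msr_protocol()) exchanges whole requests through the GHCB MSR: the low 12 bits carry the request or response code and the upper bits carry request-specific payload, e.g. the CPUID function in bits 63:32 and the register selector in bits 31:30. A sketch of that packing, mirroring the GHCB_MSR_* definitions added to svm.h (the example_* function names are invented for the example):

#include <stdint.h>

#define EX_MSR_INFO_MASK	0xfffULL	/* bits 11:0 */
#define EX_MSR_CPUID_REQ	0x004ULL
#define EX_MSR_CPUID_RESP	0x005ULL

static uint64_t example_encode_cpuid_req(uint32_t function, unsigned int reg)
{
	/* reg: 0=EAX, 1=EBX, 2=ECX, 3=EDX, carried in bits 31:30 */
	return ((uint64_t)function << 32) |
	       ((uint64_t)(reg & 0x3) << 30) |
	       EX_MSR_CPUID_REQ;
}

static uint64_t example_encode_cpuid_resp(uint64_t ghcb_msr, uint32_t value)
{
	/* rewrite only the payload and info fields, like set_ghcb_msr_bits() */
	ghcb_msr &= ~(0xffffffffULL << 32);	/* clear old payload       */
	ghcb_msr |=  (uint64_t)value << 32;	/* CPUID result            */
	ghcb_msr &= ~EX_MSR_INFO_MASK;		/* clear the request code  */
	ghcb_msr |=  EX_MSR_CPUID_RESP;		/* mark it as a response   */
	return ghcb_msr;
}
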
+ */ + svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + + /* Can't intercept CR register access, HV can't modify CR registers */ + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR4_READ); + svm_clr_intercept(svm, INTERCEPT_CR8_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); + + svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0); + + /* Track EFER/CR register changes */ + svm_set_intercept(svm, TRAP_EFER_WRITE); + svm_set_intercept(svm, TRAP_CR0_WRITE); + svm_set_intercept(svm, TRAP_CR4_WRITE); + svm_set_intercept(svm, TRAP_CR8_WRITE); + + /* No support for enable_vmware_backdoor */ + clr_exception_intercept(svm, GP_VECTOR); + + /* Can't intercept XSETBV, HV can't modify XCR0 directly */ + svm_clr_intercept(svm, INTERCEPT_XSETBV); + + /* Clear intercepts on selected MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +void sev_es_create_vcpu(struct vcpu_svm *svm) +{ + /* + * Set the GHCB MSR value as per the GHCB specification when creating + * a vCPU for an SEV-ES guest. + */ + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + GHCB_VERSION_MIN, + sev_enc_bit)); +} + +void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct vmcb_save_area *hostsa; + unsigned int i; + + /* + * As an SEV-ES guest, hardware will restore the host state on VMEXIT, + * of which one step is to perform a VMLOAD. Since hardware does not + * perform a VMSAVE on VMRUN, the host savearea must be updated. + */ + asm volatile(__ex("vmsave") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); + + /* + * Certain MSRs are restored on VMEXIT, only save ones that aren't + * restored. + */ + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) { + if (host_save_user_msrs[i].sev_es_restored) + continue; + + rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); + } + + /* XCR0 is restored on VMEXIT, save the current host value */ + hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + + /* PKRU is restored on VMEXIT, save the curent host value */ + hostsa->pkru = read_pkru(); + + /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ + hostsa->xss = host_xss; +} + +void sev_es_vcpu_put(struct vcpu_svm *svm) +{ + unsigned int i; + + /* + * Certain MSRs are restored on VMEXIT and were saved with vmsave in + * sev_es_vcpu_load() above. Only restore ones that weren't. 
+ */ + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) { + if (host_save_user_msrs[i].sev_es_restored) + continue; + + wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); + } +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index da7eb4aaf44f..cce0143a6f80 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -33,9 +33,9 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> -#include <asm/mce.h> #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> +#include <asm/traps.h> #include <asm/virtext.h> #include "trace.h" @@ -90,7 +90,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ - bool always; /* True if intercept is always on */ + bool always; /* True if intercept is initially cleared */ } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, @@ -108,6 +108,9 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_EFER, .always = false }, + { .index = MSR_IA32_CR_PAT, .always = false }, + { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_INVALID, .always = false }, }; @@ -187,10 +190,14 @@ static int vgif = true; module_param(vgif, int, 0444); /* enable/disable SEV support */ -static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); module_param(sev, int, 0444); -static bool __read_mostly dump_invalid_vmcb = 0; +/* enable/disable SEV-ES support */ +int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +module_param(sev_es, int, 0444); + +bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); static u8 rsm_ins_bytes[] = "\x0f\xaa"; @@ -336,6 +343,13 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * SEV-ES does not expose the next RIP. The RIP update is controlled by + * the type of exit and the #VC handler in the guest. 
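
sev_es_vcpu_load()/sev_es_vcpu_put() above only save and restore the host MSRs that hardware does not already reload on SEV-ES VMEXIT, which is what the new sev_es_restored flag in host_save_user_msrs expresses. A condensed model of that filtering (the MSR indices and the rdmsr/wrmsr stubs are illustrative only):

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

struct example_host_msr {
	uint32_t index;
	bool restored_by_hw;	/* restored automatically on SEV-ES VMEXIT */
};

static const struct example_host_msr example_msrs[] = {
	{ .index = 0xc0000081 /* STAR    */, .restored_by_hw = true  },
	{ .index = 0xc0000103 /* TSC_AUX */, .restored_by_hw = false },
};

#define EX_NR_MSRS (sizeof(example_msrs) / sizeof(example_msrs[0]))

static uint64_t rdmsr_stub(uint32_t idx) { (void)idx; return 0; }
static void     wrmsr_stub(uint32_t idx, uint64_t v) { (void)idx; (void)v; }

static void example_vcpu_load(uint64_t *saved)
{
	/* save only what the hardware will not bring back for us */
	for (size_t i = 0; i < EX_NR_MSRS; i++)
		if (!example_msrs[i].restored_by_hw)
			saved[i] = rdmsr_stub(example_msrs[i].index);
}

static void example_vcpu_put(const uint64_t *saved)
{
	/* restore the same subset on the way out */
	for (size_t i = 0; i < EX_NR_MSRS; i++)
		if (!example_msrs[i].restored_by_hw)
			wrmsr_stub(example_msrs[i].index, saved[i]);
}
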
+ */ + if (sev_es_guest(vcpu->kvm)) + goto done; + if (nrips && svm->vmcb->control.next_rip != 0) { WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; @@ -347,6 +361,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } else { kvm_rip_write(vcpu, svm->next_rip); } + +done: svm_set_interrupt_shadow(vcpu, 0); return 1; @@ -484,7 +500,7 @@ static int svm_hardware_enable(void) wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); @@ -552,6 +568,7 @@ static int svm_cpu_init(int cpu) sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) goto free_cpu_data; + clear_page(page_address(sd->save_area)); if (svm_sev_enabled()) { sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1, @@ -662,8 +679,8 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, msrpm[offset] = tmp; } -static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) { set_shadow_msr_intercept(vcpu, msr, read, write); set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); @@ -959,15 +976,11 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } - if (sev) { - if (boot_cpu_has(X86_FEATURE_SEV) && - IS_ENABLED(CONFIG_KVM_AMD_SEV)) { - r = sev_hardware_setup(); - if (r) - sev = false; - } else { - sev = false; - } + if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) { + sev_hardware_setup(); + } else { + sev = false; + sev_es = false; } svm_adjust_mmio_mask(); @@ -1215,6 +1228,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->cr4 = 0; } svm->asid_generation = 0; + svm->asid = 0; svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; @@ -1252,6 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm) if (sev_guest(svm->vcpu.kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + + if (sev_es_guest(svm->vcpu.kvm)) { + /* Perform SEV-ES specific VMCB updates */ + sev_es_init_vmcb(svm); + } } vmcb_mark_all_dirty(svm->vmcb); @@ -1288,6 +1307,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb_page; + struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1298,9 +1318,27 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (!vmcb_page) goto out; + if (sev_es_guest(svm->vcpu.kvm)) { + /* + * SEV-ES guests require a separate VMSA page used to contain + * the encrypted register state of the guest. + */ + vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmsa_page) + goto error_free_vmcb_page; + + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Free the fpu structure to prevent KVM from attempting to + * access the FPU state. + */ + kvm_free_guest_fpu(vcpu); + } + err = avic_init_vcpu(svm); if (err) - goto error_free_vmcb_page; + goto error_free_vmsa_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. 
@@ -1311,21 +1349,32 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmcb_page; + goto error_free_vmsa_page; } svm_vcpu_init_msrpm(vcpu, svm->msrpm); svm->vmcb = page_address(vmcb_page); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); + + if (vmsa_page) + svm->vmsa = page_address(vmsa_page); + svm->asid_generation = 0; init_vmcb(svm); svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; + if (sev_es_guest(svm->vcpu.kvm)) + /* Perform SEV-ES specific VMCB creation updates */ + sev_es_create_vcpu(svm); + return 0; +error_free_vmsa_page: + if (vmsa_page) + __free_page(vmsa_page); error_free_vmcb_page: __free_page(vmcb_page); out: @@ -1353,6 +1402,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) svm_free_nested(svm); + sev_free_vcpu(vcpu); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); } @@ -1368,15 +1419,20 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcb_mark_all_dirty(svm->vmcb); } + if (sev_es_guest(svm->vcpu.kvm)) { + sev_es_vcpu_load(svm, cpu); + } else { #ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); #endif - savesegment(fs, svm->host.fs); - savesegment(gs, svm->host.gs); - svm->host.ldt = kvm_read_ldt(); + savesegment(fs, svm->host.fs); + savesegment(gs, svm->host.gs); + svm->host.ldt = kvm_read_ldt(); - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + rdmsrl(host_save_user_msrs[i].index, + svm->host_user_msrs[i]); + } if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; @@ -1404,18 +1460,24 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) avic_vcpu_put(vcpu); ++vcpu->stat.host_state_reload; - kvm_load_ldt(svm->host.ldt); + if (sev_es_guest(svm->vcpu.kvm)) { + sev_es_vcpu_put(svm); + } else { + kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 - loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); - load_gs_index(svm->host.gs); + loadsegment(fs, svm->host.fs); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); + load_gs_index(svm->host.gs); #else #ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); + loadsegment(gs, svm->host.gs); #endif #endif - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i].index, + svm->host_user_msrs[i]); + } } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -1633,9 +1695,18 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) static void update_cr0_intercept(struct vcpu_svm *svm) { - ulong gcr0 = svm->vcpu.arch.cr0; - u64 *hcr0 = &svm->vmcb->save.cr0; + ulong gcr0; + u64 *hcr0; + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. 
+ */ + if (sev_es_guest(svm->vcpu.kvm)) + return; + + gcr0 = svm->vcpu.arch.cr0; + hcr0 = &svm->vmcb->save.cr0; *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); @@ -1655,7 +1726,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; @@ -1684,13 +1755,15 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) update_cr0_intercept(svm); } -int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; - unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + return true; +} - if (cr4 & X86_CR4_VMXE) - return 1; +void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; + unsigned long old_cr4 = vcpu->arch.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) svm_flush_tlb(vcpu); @@ -1701,7 +1774,9 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); - return 0; + + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + kvm_update_cpuid_runtime(vcpu); } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1753,18 +1828,20 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) ++sd->asid_generation; sd->next_asid = sd->min_asid; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } svm->asid_generation = sd->asid_generation; - svm->vmcb->control.asid = sd->next_asid++; - - vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + svm->asid = sd->next_asid++; } static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) { struct vmcb *vmcb = svm->vmcb; + if (svm->vcpu.arch.guest_state_protected) + return; + if (unlikely(value != vmcb->save.dr6)) { vmcb->save.dr6 = value; vmcb_mark_dirty(vmcb, VMCB_DR); @@ -1775,6 +1852,9 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (vcpu->arch.guest_state_protected) + return; + get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); @@ -1793,6 +1873,9 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); + if (vcpu->arch.guest_state_protected) + return; + svm->vmcb->save.dr7 = value; vmcb_mark_dirty(svm->vmcb, VMCB_DR); } @@ -1931,25 +2014,6 @@ static bool is_erratum_383(void) return true; } -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. 
- */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s); -#endif -} - static void svm_handle_mce(struct vcpu_svm *svm) { if (is_erratum_383()) { @@ -1981,6 +2045,13 @@ static int shutdown_interception(struct vcpu_svm *svm) struct kvm_run *kvm_run = svm->vcpu.run; /* + * The VM save area has already been encrypted so it + * cannot be reinitialized - just terminate. + */ + if (sev_es_guest(svm->vcpu.kvm)) + return -EINVAL; + + /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. */ @@ -2001,11 +2072,16 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string) - return kvm_emulate_instruction(vcpu, 0); - port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + + if (string) { + if (sev_es_guest(vcpu->kvm)) + return sev_es_string_io(svm, size, port, in); + else + return kvm_emulate_instruction(vcpu, 0); + } + svm->next_rip = svm->vmcb->control.exit_info_2; return kvm_fast_pio(&svm->vcpu, size, port, in); @@ -2269,9 +2345,11 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - svm_clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; - svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + if (!sev_es_guest(svm->vcpu.kvm)) { + svm_clr_intercept(svm, INTERCEPT_IRET); + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + } kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } @@ -2408,6 +2486,41 @@ static int cr_interception(struct vcpu_svm *svm) return kvm_complete_insn_gp(&svm->vcpu, err); } +static int cr_trap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long old_value, new_value; + unsigned int cr; + int ret = 0; + + new_value = (unsigned long)svm->vmcb->control.exit_info_1; + + cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; + switch (cr) { + case 0: + old_value = kvm_read_cr0(vcpu); + svm_set_cr0(vcpu, new_value); + + kvm_post_set_cr0(vcpu, old_value, new_value); + break; + case 4: + old_value = kvm_read_cr4(vcpu); + svm_set_cr4(vcpu, new_value); + + kvm_post_set_cr4(vcpu, old_value, new_value); + break; + case 8: + ret = kvm_set_cr8(&svm->vcpu, new_value); + break; + default: + WARN(1, "unhandled CR%d write trap", cr); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + return kvm_complete_insn_gp(vcpu, ret); +} + static int dr_interception(struct vcpu_svm *svm) { int reg, dr; @@ -2461,6 +2574,25 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +static int efer_trap(struct vcpu_svm *svm) +{ + struct msr_data msr_info; + int ret; + + /* + * Clear the EFER_SVME bit from EFER. The SVM code always sets this + * bit in svm_set_efer(), but __kvm_valid_efer() checks it against + * whether the guest has X86_FEATURE_SVM - this avoids a failure if + * the guest doesn't have X86_FEATURE_SVM. 
+ */ + msr_info.host_initiated = false; + msr_info.index = MSR_EFER; + msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME; + ret = kvm_set_msr_common(&svm->vcpu, &msr_info); + + return kvm_complete_insn_gp(&svm->vcpu, ret); +} + static int svm_get_msr_feature(struct kvm_msr_entry *msr) { msr->data = 0; @@ -2543,10 +2675,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = svm->spec_ctrl; @@ -2584,6 +2713,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!sev_es_guest(svm->vcpu.kvm) || !err) + return kvm_complete_insn_gp(&svm->vcpu, err); + + ghcb_set_sw_exit_info_1(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->ghcb, + X86_TRAP_GP | + SVM_EVTINJ_TYPE_EXEPT | + SVM_EVTINJ_VALID); + return 1; +} + static int rdmsr_interception(struct vcpu_svm *svm) { return kvm_emulate_rdmsr(&svm->vcpu); @@ -2630,10 +2773,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2658,12 +2798,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; @@ -2805,7 +2945,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm) static int pause_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - bool in_kernel = (svm_get_cpl(vcpu) == 0); + bool in_kernel; + + /* + * CPL is not made available for an SEV-ES guest, therefore + * vcpu->arch.preempted_in_kernel can never be true. Just + * set in_kernel to false as well. 
+ */ + in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0; if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); @@ -2920,11 +3067,16 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, + [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, + [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, + [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, + [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -2966,6 +3118,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); + pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); @@ -2973,6 +3126,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); + pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3045,6 +3199,43 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ + if (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]) + return 0; + + vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); + dump_vmcb(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + + return -EINVAL; +} + +int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code) +{ + if (svm_handle_invalid_exit(&svm->vcpu, exit_code)) + return 0; + +#ifdef CONFIG_RETPOLINE + if (exit_code == SVM_EXIT_MSR) + return msr_interception(svm); + else if (exit_code == SVM_EXIT_VINTR) + return interrupt_window_interception(svm); + else if (exit_code == SVM_EXIT_INTR) + return intr_interception(svm); + else if (exit_code == SVM_EXIT_HLT) + return halt_interception(svm); + else if (exit_code == SVM_EXIT_NPF) + return npf_interception(svm); +#endif + return svm_exit_handlers[exit_code](svm); +} + static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { @@ -3068,10 +3259,13 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; + /* SEV-ES guests must use the CR write traps to track CR registers. 
*/ + if (!sev_es_guest(vcpu->kvm)) { + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; + } if (is_guest_mode(vcpu)) { int vmexit; @@ -3108,32 +3302,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); - dump_vmcb(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - return 0; - } - -#ifdef CONFIG_RETPOLINE - if (exit_code == SVM_EXIT_MSR) - return msr_interception(svm); - else if (exit_code == SVM_EXIT_VINTR) - return interrupt_window_interception(svm); - else if (exit_code == SVM_EXIT_INTR) - return intr_interception(svm); - else if (exit_code == SVM_EXIT_HLT) - return halt_interception(svm); - else if (exit_code == SVM_EXIT_NPF) - return npf_interception(svm); -#endif - return svm_exit_handlers[exit_code](svm); + return svm_invoke_exit_handler(svm, exit_code); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -3162,7 +3331,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - svm_set_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3183,6 +3353,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. + */ + if (sev_es_guest(vcpu->kvm)) + return; + if (nested_svm_virtualize_tpr(vcpu)) return; @@ -3239,10 +3416,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - svm_set_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - svm_clr_intercept(svm, INTERCEPT_IRET); + if (!sev_es_guest(svm->vcpu.kvm)) + svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3254,7 +3433,14 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (is_guest_mode(vcpu)) { + if (sev_es_guest(svm->vcpu.kvm)) { + /* + * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask + * bit to determine the state of the IF flag. + */ + if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) + return true; + } else if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) @@ -3413,8 +3599,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. 
*/ - if ((svm->vcpu.arch.hflags & HF_IRET_MASK) - && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && + (sev_es_guest(svm->vcpu.kvm) || + kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } @@ -3437,6 +3624,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) break; case SVM_EXITINTINFO_TYPE_EXEPT: /* + * Never re-inject a #VC exception. + */ + if (vector == X86_TRAP_VC) + break; + + /* * In case of software exceptions, do not reinject the vector, * but re-execute the instruction instead. Rewind RIP first * if we emulated INT3 before. @@ -3509,16 +3702,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + if (sev_es_guest(svm->vcpu.kvm)) { + __svm_sev_es_vcpu_run(svm->vmcb_pa); + } else { + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); #ifdef CONFIG_X86_64 - native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); + native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else - loadsegment(fs, svm->host.fs); + loadsegment(fs, svm->host.fs); #ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); + loadsegment(gs, svm->host.gs); #endif #endif + } /* * VMEXIT disables interrupts (host state), but tracing and lockdep @@ -3568,6 +3765,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); + if (unlikely(svm->asid != svm->vmcb->control.asid)) { + svm->vmcb->control.asid = svm->asid; + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + } svm->vmcb->save.cr2 = vcpu->arch.cr2; /* @@ -3612,14 +3813,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - reload_tss(vcpu); + if (!sev_es_guest(svm->vcpu.kvm)) + reload_tss(vcpu); x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - vcpu->arch.cr2 = svm->vmcb->save.cr2; - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (!sev_es_guest(svm->vcpu.kvm)) { + vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); @@ -3722,12 +3926,21 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_emulated_msr(u32 index) +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; + case MSR_IA32_SMBASE: + /* SEV-ES guests do not support SMM, so report false */ + if (kvm && sev_es_guest(kvm)) + return false; + break; default: break; } @@ -4086,6 +4299,12 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i unsigned long cr4; /* + * When the guest is an SEV-ES guest, emulation is not possible. 
+ */ + if (sev_es_guest(vcpu->kvm)) + return false; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: @@ -4217,6 +4436,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, .get_idt = svm_get_idt, @@ -4305,6 +4525,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, .msr_filter_changed = svm_msr_filter_changed, + .complete_emulated_msr = svm_complete_emulated_msr, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1d853fe4c778..5431e6335e2e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -17,21 +17,32 @@ #include <linux/kvm_types.h> #include <linux/kvm_host.h> +#include <linux/bits.h> #include <asm/svm.h> -static const u32 host_save_user_msrs[] = { +#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) + +static const struct svm_host_save_msrs { + u32 index; /* Index of the MSR */ + bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */ +} host_save_user_msrs[] = { #ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, + { .index = MSR_STAR, .sev_es_restored = true }, + { .index = MSR_LSTAR, .sev_es_restored = true }, + { .index = MSR_CSTAR, .sev_es_restored = true }, + { .index = MSR_SYSCALL_MASK, .sev_es_restored = true }, + { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true }, + { .index = MSR_FS_BASE, .sev_es_restored = true }, #endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_TSC_AUX, + { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true }, + { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true }, + { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true }, + { .index = MSR_TSC_AUX, .sev_es_restored = false }, }; - #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define MAX_DIRECT_ACCESS_MSRS 15 +#define MAX_DIRECT_ACCESS_MSRS 18 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -61,11 +72,13 @@ enum { struct kvm_sev_info { bool active; /* SEV enabled guest */ + bool es_active; /* SEV-ES enabled guest */ unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ + u64 ap_jump_table; /* SEV-ES AP Jump Table address */ }; struct kvm_svm { @@ -106,6 +119,7 @@ struct vcpu_svm { struct vmcb *vmcb; unsigned long vmcb_pa; struct svm_cpu_data *svm_data; + u32 asid; uint64_t asid_generation; uint64_t sysenter_esp; uint64_t sysenter_eip; @@ -166,6 +180,17 @@ struct vcpu_svm { DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; + + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u64 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; }; struct svm_cpu_data { @@ -193,6 +218,28 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } +static inline bool sev_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info 
*sev = &to_kvm_svm(kvm)->sev_info; + + return sev->active; +#else + return false; +#endif +} + +static inline bool sev_es_guest(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_AMD_SEV + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return sev_guest(kvm) && sev->es_active; +#else + return false; +#endif +} + static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -244,21 +291,24 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + if (!sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + } + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); @@ -270,6 +320,12 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) vmcb->control.intercepts[INTERCEPT_DR] = 0; + /* DR7 access must remain intercepted for an SEV-ES guest */ + if (sev_es_guest(svm->vcpu.kvm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + } + recalc_intercepts(svm); } @@ -351,6 +407,10 @@ static inline bool gif_set(struct vcpu_svm *svm) #define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU +extern int sev; +extern int sev_es; +extern bool dump_invalid_vmcb; + u32 svm_msrpm_offset(u32 msr); u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); @@ -358,13 +418,16 @@ void svm_vcpu_free_msrpm(u32 *msrpm); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); bool 
svm_interrupt_blocked(struct kvm_vcpu *vcpu); void svm_set_gif(struct vcpu_svm *svm, bool value); +int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code); +void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write); /* nested.c */ @@ -470,18 +533,42 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -extern unsigned int max_sev_asid; +#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_MSR_INFO_POS 0 +#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) + +#define GHCB_MSR_SEV_INFO_RESP 0x001 +#define GHCB_MSR_SEV_INFO_REQ 0x002 +#define GHCB_MSR_VER_MAX_POS 48 +#define GHCB_MSR_VER_MAX_MASK 0xffff +#define GHCB_MSR_VER_MIN_POS 32 +#define GHCB_MSR_VER_MIN_MASK 0xffff +#define GHCB_MSR_CBIT_POS 24 +#define GHCB_MSR_CBIT_MASK 0xff +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ + (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ + (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + GHCB_MSR_SEV_INFO_RESP) + +#define GHCB_MSR_CPUID_REQ 0x004 +#define GHCB_MSR_CPUID_RESP 0x005 +#define GHCB_MSR_CPUID_FUNC_POS 32 +#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff +#define GHCB_MSR_CPUID_VALUE_POS 32 +#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff +#define GHCB_MSR_CPUID_REG_POS 30 +#define GHCB_MSR_CPUID_REG_MASK 0x3 + +#define GHCB_MSR_TERM_REQ 0x100 +#define GHCB_MSR_TERM_REASON_SET_POS 12 +#define GHCB_MSR_TERM_REASON_SET_MASK 0xf +#define GHCB_MSR_TERM_REASON_POS 16 +#define GHCB_MSR_TERM_REASON_MASK 0xff -static inline bool sev_guest(struct kvm *kvm) -{ -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif -} +extern unsigned int max_sev_asid; static inline bool svm_sev_enabled(void) { @@ -495,7 +582,19 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); void pre_sev_run(struct vcpu_svm *svm, int cpu); -int __init sev_hardware_setup(void); +void __init sev_hardware_setup(void); void sev_hardware_teardown(void); +void sev_free_vcpu(struct kvm_vcpu *vcpu); +int sev_handle_vmgexit(struct vcpu_svm *svm); +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); +void sev_es_init_vmcb(struct vcpu_svm *svm); +void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu); +void sev_es_vcpu_put(struct vcpu_svm *svm); + +/* vmenter.S */ + +void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); +void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); #endif diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 1ec1ac40e328..6feb8c08f45a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -168,3 +168,53 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_BP ret SYM_FUNC_END(__svm_vcpu_run) + +/** + * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode + * @vmcb_pa: unsigned long + */ +SYM_FUNC_START(__svm_sev_es_vcpu_run) + push %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX + + /* Enter guest mode */ + mov %_ASM_ARG1, %_ASM_AX + sti + +1: vmrun %_ASM_AX + jmp 3f +2: cmpb $0, kvm_rebooting + jne 3f + ud2 + _ASM_EXTABLE(1b, 2b) + +3: cli + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! 
*/ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif + pop %_ASM_BP + ret +SYM_FUNC_END(__svm_sev_es_vcpu_run) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index aef960f90f26..2de30c20bc26 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1578,6 +1578,103 @@ TRACE_EVENT(kvm_hv_syndbg_get_msr, __entry->vcpu_id, __entry->vp_index, __entry->msr, __entry->data) ); + +/* + * Tracepoint for the start of VMGEXIT processing + */ +TRACE_EVENT(kvm_vmgexit_enter, + TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), + TP_ARGS(vcpu_id, ghcb), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, exit_reason) + __field(u64, info1) + __field(u64, info2) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->exit_reason = ghcb->save.sw_exit_code; + __entry->info1 = ghcb->save.sw_exit_info_1; + __entry->info2 = ghcb->save.sw_exit_info_2; + ), + + TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", + __entry->vcpu_id, __entry->exit_reason, + __entry->info1, __entry->info2) +); + +/* + * Tracepoint for the end of VMGEXIT processing + */ +TRACE_EVENT(kvm_vmgexit_exit, + TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb), + TP_ARGS(vcpu_id, ghcb), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, exit_reason) + __field(u64, info1) + __field(u64, info2) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->exit_reason = ghcb->save.sw_exit_code; + __entry->info1 = ghcb->save.sw_exit_info_1; + __entry->info2 = ghcb->save.sw_exit_info_2; + ), + + TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx", + __entry->vcpu_id, __entry->exit_reason, + __entry->info1, __entry->info2) +); + +/* + * Tracepoint for the start of VMGEXIT MSR procotol processing + */ +TRACE_EVENT(kvm_vmgexit_msr_protocol_enter, + TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa), + TP_ARGS(vcpu_id, ghcb_gpa), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, ghcb_gpa) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->ghcb_gpa = ghcb_gpa; + ), + + TP_printk("vcpu %u, ghcb_gpa %016llx", + __entry->vcpu_id, __entry->ghcb_gpa) +); + +/* + * Tracepoint for the end of VMGEXIT MSR procotol processing + */ +TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, + TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result), + TP_ARGS(vcpu_id, ghcb_gpa, result), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, ghcb_gpa) + __field(int, result) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->ghcb_gpa = ghcb_gpa; + __entry->result = result; + ), + + TP_printk("vcpu %u, ghcb_gpa %016llx, result %d", + __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index f3199bb02f22..41f24661af04 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -326,7 +326,6 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); /* * vmcs_version represents the range of supported Enlightened VMCS * versions: lower 8 bits is the minimal version, higher 8 bits is the @@ -334,7 +333,7 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) * KVM_EVMCS_VERSION. 
*/ if (kvm_cpu_cap_get(X86_FEATURE_VMX) && - vmx->nested.enlightened_vmcs_enabled) + (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled)) return (KVM_EVMCS_VERSION << 8) | 1; return 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 89af692deb7e..e2f26564a12d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2952,7 +2952,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && + vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) return -EINVAL; return 0; @@ -3559,19 +3560,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ nested_cache_shadow_vmcs12(vcpu, vmcs12); - /* - * If we're entering a halted L2 vcpu and the L2 vcpu won't be - * awakened by event injection or by an NMI-window VM-exit or - * by an interrupt-window VM-exit, halt the vcpu. - */ - if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && - (vmcs12->guest_rflags & X86_EFLAGS_IF))) { + switch (vmcs12->guest_activity_state) { + case GUEST_ACTIVITY_HLT: + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be + * awakened by event injection or by an NMI-window VM-exit or + * by an interrupt-window VM-exit, halt the vcpu. + */ + if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && + !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && + !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && + (vmcs12->guest_rflags & X86_EFLAGS_IF))) { + vmx->nested.nested_run_pending = 0; + return kvm_vcpu_halt(vcpu); + } + break; + case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + break; + default: + break; } + return 1; vmentry_failed: @@ -3797,7 +3808,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); - nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + return 0; + } + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_SIPI, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, + apic->sipi_vector & 0xFFUL); return 0; } @@ -4036,6 +4060,8 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; @@ -4814,7 +4840,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* * The Intel VMX Instruction Reference lists a bunch of bits that are * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_set_cr4() for when we allow the guest to 
set this). + * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). * Otherwise, we should fail with #UD. But most faulting conditions * have already been checked by hardware, prior to the VM-exit for * VMXON. We do test guest cr4.VMXE because processor CR4 always has @@ -6483,7 +6509,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->misc_low |= MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | - VMX_MISC_ACTIVITY_HLT; + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; /* diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 90ad7a6246e3..e85aa5faa22d 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -132,7 +132,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - cmpb $0, %bl + testb %bl, %bl /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 47b8357b9751..75c9c6a0a3a4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -40,7 +40,6 @@ #include <asm/irq_remapping.h> #include <asm/kexec.h> #include <asm/perf_event.h> -#include <asm/mce.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/mwait.h> @@ -1826,7 +1825,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; @@ -2028,7 +2027,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2063,12 +2062,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; @@ -3095,8 +3094,25 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, vmcs_writel(GUEST_CR3, guest_cr3); } -int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + /* + * We operate under the default treatment of SMM, so VMX cannot be + * enabled under SMM. Note, whether or not VMXE is allowed at all is + * handled by kvm_is_valid_cr4(). + */ + if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) + return false; + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + return false; + + return true; +} + +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = vcpu->arch.cr4; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which @@ -3123,21 +3139,6 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } } - if (cr4 & X86_CR4_VMXE) { - /* - * To use VMXON (and later other VMX instructions), a guest - * must first be able to turn on cr4.VMXE (see handle_vmon()). - * So basically the check on whether to allow nested VMX - * is here. 
We operate under the default treatment of SMM, - * so VMX cannot be enabled under SMM. - */ - if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) - return 1; - } - - if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) - return 1; - vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); @@ -3168,7 +3169,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); - return 0; + + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + kvm_update_cpuid_runtime(vcpu); } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -3515,42 +3518,33 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) return true; } -static int init_rmode_tss(struct kvm *kvm) +static int init_rmode_tss(struct kvm *kvm, void __user *ua) { - gfn_t fn; - u16 data = 0; - int idx, r; + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + u16 data; + int i; + + for (i = 0; i < 3; i++) { + if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) + return -EFAULT; + } - idx = srcu_read_lock(&kvm->srcu); - fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, - TSS_IOPB_BASE_OFFSET, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; + if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) + return -EFAULT; + data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); -out: - srcu_read_unlock(&kvm->srcu, idx); - return r; + if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) + return -EFAULT; + + return 0; } static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, r = 0; - kvm_pfn_t identity_map_pfn; + void __user *uaddr; u32 tmp; /* Protect kvm_vmx->ept_identity_pagetable_done. 
*/ @@ -3561,24 +3555,24 @@ static int init_rmode_identity_map(struct kvm *kvm) if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm_vmx->ept_identity_map_addr, PAGE_SIZE); - if (r < 0) + uaddr = __x86_set_memory_region(kvm, + IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm_vmx->ept_identity_map_addr, + PAGE_SIZE); + if (IS_ERR(uaddr)) { + r = PTR_ERR(uaddr); goto out; + } - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r < 0) - goto out; /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i < PT32_ENT_PER_PAGE; i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, - &tmp, i * sizeof(tmp), sizeof(tmp)); - if (r < 0) + if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { + r = -EFAULT; goto out; + } } kvm_vmx->ept_identity_pagetable_done = true; @@ -3605,19 +3599,22 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - int r = 0; + void __user *hva; + int ret = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (r) + hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (IS_ERR(hva)) { + ret = PTR_ERR(hva); goto out; + } page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { - r = -EFAULT; + ret = -EFAULT; goto out; } @@ -3629,7 +3626,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); - return r; + return ret; } int allocate_vpid(void) @@ -4638,7 +4635,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { - int ret; + void __user *ret; if (enable_unrestricted_guest) return 0; @@ -4648,10 +4645,12 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) PAGE_SIZE * 3); mutex_unlock(&kvm->slots_lock); - if (ret) - return ret; + if (IS_ERR(ret)) + return PTR_ERR(ret); + to_kvm_vmx(kvm)->tss_addr = addr; - return init_rmode_tss(kvm); + + return init_rmode_tss(kvm, ret); } static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) @@ -4716,25 +4715,6 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, return 1; } -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. 
- */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s); -#endif -} - static int handle_machine_check(struct kvm_vcpu *vcpu) { /* handled by vmx_vcpu_run() */ @@ -6399,7 +6379,11 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) handle_exception_nmi_irqoff(vmx); } -static bool vmx_has_emulated_msr(u32 index) +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -7558,7 +7542,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.vmxon; + return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } static void vmx_migrate_timers(struct kvm_vcpu *vcpu) @@ -7587,6 +7571,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } +static int vmx_cpu_dirty_log_size(void) +{ + return enable_pml ? PML_ENTITY_NUM : 0; +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, @@ -7616,6 +7605,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .set_cr0 = vmx_set_cr0, + .is_valid_cr4 = vmx_is_valid_cr4, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, .get_idt = vmx_get_idt, @@ -7715,6 +7705,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .migrate_timers = vmx_migrate_timers, .msr_filter_changed = vmx_msr_filter_changed, + .complete_emulated_msr = kvm_complete_insn_gp, + .cpu_dirty_log_size = vmx_cpu_dirty_log_size, }; static __init int hardware_setup(void) @@ -7832,6 +7824,7 @@ static __init int hardware_setup(void) vmx_x86_ops.slot_disable_log_dirty = NULL; vmx_x86_ops.flush_log_dirty = NULL; vmx_x86_ops.enable_log_dirty_pt_masked = NULL; + vmx_x86_ops.cpu_dirty_log_size = NULL; } if (!cpu_has_vmx_preemption_timer()) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f6f66e5c6510..9d3a557949ac 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -321,7 +321,7 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e545a8a613b1..3f7c1fc7a3ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -197,7 +197,8 @@ EXPORT_SYMBOL_GPL(host_efer); bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); -static u64 __read_mostly host_xss; +u64 __read_mostly host_xss; +EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -804,11 +805,29 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(pdptrs_changed); +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) +{ + unsigned long update_bits = 
X86_CR0_PG | X86_CR0_WP; + + if ((cr0 ^ old_cr0) & X86_CR0_PG) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } + + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); + + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); +} +EXPORT_SYMBOL_GPL(kvm_post_set_cr0); + int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -847,18 +866,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops.set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - - if ((cr0 ^ old_cr0) & update_bits) - kvm_mmu_reset_context(vcpu); - - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } @@ -872,6 +880,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -892,6 +903,9 @@ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { @@ -964,26 +978,36 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) - return -EINVAL; + return false; if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return -EINVAL; + return false; - return 0; + return kvm_x86_ops.is_valid_cr4(vcpu, cr4); +} +EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); + +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) +{ + unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (((cr4 ^ old_cr4) & mmu_role_bits) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_valid_cr4); +EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP; - unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; - if (kvm_valid_cr4(vcpu, cr4)) + if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1006,15 +1030,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops.set_cr4(vcpu, cr4)) - return 1; + kvm_x86_ops.set_cr4(vcpu, cr4); - if (((cr4 ^ old_cr4) & mmu_role_bits) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) - kvm_mmu_reset_context(vcpu); - - if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } @@ -1638,27 +1656,20 @@ int kvm_set_msr(struct kvm_vcpu 
*vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); -static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) { - if (vcpu->run->msr.error) { - kvm_inject_gp(vcpu, 0); - return 1; - } else if (is_read) { + int err = vcpu->run->msr.error; + if (!err) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } - return kvm_skip_emulated_instruction(vcpu); -} - -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) -{ - return complete_emulated_msr(vcpu, true); + return kvm_x86_ops.complete_emulated_msr(vcpu, err); } static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) { - return complete_emulated_msr(vcpu, false); + return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error); } static u64 kvm_msr_reason(int r) @@ -1721,18 +1732,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) return 0; } - /* MSR read failed? Inject a #GP */ - if (r) { + if (!r) { + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; } - trace_kvm_msr_read(ecx, data); - - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); - return kvm_skip_emulated_instruction(vcpu); + return kvm_x86_ops.complete_emulated_msr(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -1753,15 +1762,12 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) if (r < 0) return r; - /* MSR write failed? Inject a #GP */ - if (r > 0) { + if (!r) + trace_kvm_msr_write(ecx, data); + else trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); + return kvm_x86_ops.complete_emulated_msr(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); @@ -3678,6 +3684,27 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 __user *cpuid_arg) +{ + struct kvm_cpuid2 cpuid; + int r; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + return r; + + r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + return r; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + return r; + + return 0; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -3714,6 +3741,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: + case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -3762,7 +3790,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. 
*/ - r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); + r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: r = !kvm_x86_ops.cpu_has_accelerated_tpr(); @@ -3899,6 +3927,9 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); + break; default: r = -EINVAL; break; @@ -3997,7 +4028,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted) + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); /* @@ -4481,6 +4512,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { + if (!vcpu->arch.guest_fpu) + return; + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); @@ -4498,9 +4532,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + u64 xstate_bv; + u32 mxcsr; + + if (!vcpu->arch.guest_fpu) + return 0; + + xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -4977,25 +5016,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } - case KVM_GET_SUPPORTED_HV_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) - goto out; - - r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) - goto out; - r = 0; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; - } default: r = -EINVAL; } @@ -5776,7 +5799,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) + if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -8158,7 +8181,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + /* + * if_flag is obsolete and useless, so do not bother + * setting it for SEV-ES guests. Userspace can just + * use kvm_run->ready_for_interrupt_injection. + */ + kvm_run->if_flag = !vcpu->arch.guest_state_protected + && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->flags = is_smm(vcpu) ? 
KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); @@ -8748,6 +8778,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; + /* Forbid vmenter if vcpu dirty ring is soft-full */ + if (unlikely(vcpu->kvm->dirty_ring_size && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + r = 0; + goto out; + } + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { @@ -9223,9 +9262,14 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops.run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); + /* + * Guests with protected state can't have it set by the hypervisor, + * so skip trying to set it. + */ + if (vcpu->arch.guest_fpu) + /* PKRU is separately restored in kvm_x86_ops.run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); fpregs_unlock(); @@ -9238,7 +9282,12 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - kvm_save_current_fpu(vcpu->arch.guest_fpu); + /* + * Guests with protected state can't have it read by the hypervisor, + * so skip trying to save it. + */ + if (vcpu->arch.guest_fpu) + kvm_save_current_fpu(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); @@ -9417,6 +9466,9 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -9434,9 +9486,11 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; - sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = kvm_read_cr3(vcpu); + +skip_protected_regs: + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; @@ -9535,7 +9589,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -9543,31 +9597,29 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. */ - if (!(sregs->cr4 & X86_CR4_PAE) - || !(sregs->efer & EFER_LMA)) - return -EINVAL; + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code * segment cannot be 64-bit. 
*/ if (sregs->efer & EFER_LMA || sregs->cs.l) - return -EINVAL; + return false; } - return kvm_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; - int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; int ret = -EINVAL; - if (kvm_valid_sregs(vcpu, sregs)) + if (!kvm_is_valid_sregs(vcpu, sregs)) goto out; apic_base_msr.data = sregs->apic_base; @@ -9575,6 +9627,9 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (kvm_set_apic_base(vcpu, &apic_base_msr)) goto out; + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops.set_idt(vcpu, &dt); @@ -9597,11 +9652,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & - (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops.set_cr4(vcpu, sregs->cr4); - if (cpuid_update_needed) - kvm_update_cpuid_runtime(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -9613,14 +9664,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = KVM_NR_INTERRUPTS; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, false); - pr_debug("Set back pending irq %d\n", pending_vec); - } - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -9639,6 +9682,15 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +skip_protected_regs: + max_bits = KVM_NR_INTERRUPTS; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -9663,6 +9715,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; + if (vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { @@ -9742,6 +9797,9 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9762,6 +9820,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (!vcpu->arch.guest_fpu) + return 0; + vcpu_load(vcpu); fxsave = &vcpu->arch.guest_fpu->state.fxsave; @@ -9820,6 +9881,9 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { + if (!vcpu->arch.guest_fpu) + return; + fpstate_init(&vcpu->arch.guest_fpu->state); if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = @@ -9833,6 +9897,15 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_fpu) { + 
kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + vcpu->arch.guest_fpu = NULL; + } +} +EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -9869,7 +9942,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) r = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); @@ -9928,7 +10001,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kvm_free_guest_fpu(vcpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_emulate_ctxt: @@ -9982,7 +10055,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kvm_free_guest_fpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -10030,7 +10103,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (kvm_mpx_supported()) { + if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { void *mpx_state_buffer; /* @@ -10349,7 +10422,32 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + +/** + * __x86_set_memory_region: Setup KVM internal memory slot + * + * @kvm: the kvm pointer to the VM. + * @id: the slot ID to setup. + * @gpa: the GPA to install the slot (unused when @size == 0). + * @size: the size of the slot. Set to zero to uninstall a slot. + * + * This function helps to setup a KVM internal memory slot. Specify + * @size > 0 to install a new slot, while @size == 0 to uninstall a + * slot. The return code can be one of the following: + * + * HVA: on success (uninstall will return a bogus HVA) + * -errno: on error + * + * The caller should always use IS_ERR() to check the return value + * before use. Note, the KVM internal memory slots are guaranteed to + * remain valid and unchanged until the VM is destroyed, i.e., the + * GPA->HVA translation will not change. However, the HVA is a user + * address, i.e. its accessibility is not guaranteed, and must be + * accessed via __copy_{to,from}_user(). + */ +void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size) { int i, r; unsigned long hva, old_npages; @@ -10358,12 +10456,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) /* Called with kvm->slots_lock held. 
*/ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) - return -EINVAL; + return ERR_PTR_USR(-EINVAL); slot = id_to_memslot(slots, id); if (size) { if (slot && slot->npages) - return -EEXIST; + return ERR_PTR_USR(-EEXIST); /* * MAP_SHARED to prevent internal slot pages from being moved @@ -10372,7 +10470,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); if (IS_ERR((void *)hva)) - return PTR_ERR((void *)hva); + return (void __user *)hva; } else { if (!slot || !slot->npages) return 0; @@ -10391,13 +10489,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) - return r; + return ERR_PTR_USR(r); } if (!size) vm_munmap(hva, old_npages * PAGE_SIZE); - return 0; + return (void __user *)hva; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); @@ -10754,6 +10852,10 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { + /* Can't read the RIP when guest state is protected, just return 0 */ + if (vcpu->arch.guest_state_protected) + return 0; + if (is_64_bit_mode(vcpu)) return kvm_rip_read(vcpu); return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + @@ -11263,6 +11365,179 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); +static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; + unsigned int len; + + BUG_ON(!vcpu->mmio_needed); + + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. 
*/ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + + // VMG change, at this point, we're always done + // RIP has already been advanced + return 1; + } + + // More MMIO is needed + run->mmio.phys_addr = frag->gpa; + run->mmio.len = min(8u, frag->len); + run->mmio.is_write = vcpu->mmio_is_write; + if (run->mmio.is_write) + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} + +int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 1; + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); + +int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); + +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, + vcpu->arch.pio.count * vcpu->arch.pio.size); + vcpu->arch.pio.count = 0; + + return 1; +} + +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count) +{ + int ret; + + ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, + data, count); + if (ret) + return ret; + + vcpu->arch.pio.count = 0; + + return 0; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count) +{ + int ret; + + ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, + data, count); + if (ret) { + vcpu->arch.pio.count = 0; + } else { + vcpu->arch.guest_ins_data = data; + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; + } + + return 0; +} + +int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count, + int in) +{ + return in ? 
kvm_sev_es_ins(vcpu, size, port, data, count) + : kvm_sev_es_outs(vcpu, size, port, data, count); +} +EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); @@ -11285,3 +11560,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e7ca622a468f..c5ee0f5ce0f1 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -3,6 +3,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/mce.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -278,6 +279,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 supported_xcr0; +extern u64 host_xss; extern u64 supported_xss; static inline bool kvm_mpx_supported(void) @@ -366,10 +368,29 @@ static inline bool kvm_dr6_valid(u64 data) return !(data >> 32); } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static inline void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s); +#endif +} + void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); @@ -407,4 +428,12 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); __reserved_bits; \ }) +int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes, + void *dst); +int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes, + void *dst); +int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count, + int in); + #endif diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index c13e8c9ee926..80efd45a7761 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -10,10 +10,6 @@ #include <asm/mce.h> #ifdef CONFIG_X86_MCE -/* - * See COPY_MC_TEST for self-test of the copy_mc_fragile() - * implementation. - */ static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index 892d8915f609..e5f77e293034 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -2,14 +2,11 @@ /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. 
*/ #include <linux/linkage.h> -#include <asm/copy_mc_test.h> -#include <asm/export.h> #include <asm/asm.h> #ifndef CONFIG_UML #ifdef CONFIG_X86_MCE -COPY_MC_TEST_CTL /* * copy_mc_fragile - copy memory with indication if an exception / fault happened @@ -38,8 +35,6 @@ SYM_FUNC_START(copy_mc_fragile) subl %ecx, %edx .L_read_leading_bytes: movb (%rsi), %al - COPY_MC_TEST_SRC %rsi 1 .E_leading_bytes - COPY_MC_TEST_DST %rdi 1 .E_leading_bytes .L_write_leading_bytes: movb %al, (%rdi) incq %rsi @@ -55,8 +50,6 @@ SYM_FUNC_START(copy_mc_fragile) .L_read_words: movq (%rsi), %r8 - COPY_MC_TEST_SRC %rsi 8 .E_read_words - COPY_MC_TEST_DST %rdi 8 .E_write_words .L_write_words: movq %r8, (%rdi) addq $8, %rsi @@ -73,8 +66,6 @@ SYM_FUNC_START(copy_mc_fragile) movl %edx, %ecx .L_read_trailing_bytes: movb (%rsi), %al - COPY_MC_TEST_SRC %rsi 1 .E_trailing_bytes - COPY_MC_TEST_DST %rdi 1 .E_trailing_bytes .L_write_trailing_bytes: movb %al, (%rdi) incq %rsi @@ -88,7 +79,6 @@ SYM_FUNC_START(copy_mc_fragile) .L_done: ret SYM_FUNC_END(copy_mc_fragile) -EXPORT_SYMBOL_GPL(copy_mc_fragile) .section .fixup, "ax" /* diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index fee8b9c0520c..75a0915b0d01 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info) int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; - call_single_data_t csd = { - .func = __rdmsr_safe_on_cpu, - .info = &rv, - }; + call_single_data_t csd; int err; + INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); + memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 3f435d7fca5e..c3e8a62ca561 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -9,9 +9,23 @@ #include <asm/tlbflush.h> -/* - * We rely on the nested NMI work to allow atomic faults from the NMI path; the - * nested NMI paths are careful to preserve CR2. +/** + * copy_from_user_nmi - NMI safe copy from user + * @to: Pointer to the destination buffer + * @from: Pointer to a user space address of the current task + * @n: Number of bytes to copy + * + * Returns: The number of not copied bytes. 0 is success, i.e. all bytes copied + * + * Contrary to other copy_from_user() variants this function can be called + * from NMI context. Despite the name it is not restricted to be called + * from NMI context. It is safe to be called from any other context as + * well. It disables pagefaults across the copy which means a fault will + * abort the copy. + * + * For NMI context invocations this relies on the nested NMI work to allow + * atomic faults from the NMI path; the nested NMI paths are careful to + * preserve CR2. */ unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) @@ -27,7 +41,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) /* * Even though this function is typically called from NMI/IRQ context * disable pagefaults so that its behaviour is consistent even when - * called form other contexts. + * called from other contexts. */ pagefault_disable(); ret = __copy_from_user_inatomic(to, from, n); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 82bf37a5c9ec..f1f1b5a0956a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... 
*/ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ +#include <asm/vdso.h> /* fixup_vdso_exception() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -602,11 +603,9 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -static void set_signal_archinfo(unsigned long address, - unsigned long error_code) +static void sanitize_error_code(unsigned long address, + unsigned long *error_code) { - struct task_struct *tsk = current; - /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to @@ -617,7 +616,13 @@ static void set_signal_archinfo(unsigned long address, * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + *error_code |= X86_PF_PROT; +} + +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; @@ -658,6 +663,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ @@ -806,13 +813,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; - /* - * To avoid leaking information about the kernel page table - * layout, pretend that user-mode accesses to kernel addresses - * are always protection faults. - */ - if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -931,6 +935,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE @@ -1102,6 +1111,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 1; /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + + /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. 
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 075fe51317b0..2c54b76d8f84 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,65 +4,6 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic_prot_pfn(pfn, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); - } -#ifdef CONFIG_DEBUG_HIGHMEM - else { - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); - } -#endif -} -EXPORT_SYMBOL(kunmap_atomic_high); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index fe7a12599d8e..968d7005f4a7 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -62,6 +62,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, unsigned long addr, unsigned long end) { unsigned long next; + int result; for (; addr < end; addr = next) { p4d_t *p4d = p4d_page + p4d_index(addr); @@ -73,13 +74,20 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (p4d_present(*p4d)) { pud = pud_offset(p4d, 0); - ident_pud_init(info, pud, addr, next); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; } pud = (pud_t *)info->alloc_pgt_page(info->context); if (!pud) return -ENOMEM; - ident_pud_init(info, pud, addr, next); + + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c7a47603537f..e26f5c5c6565 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -596,7 +596,7 @@ static unsigned long __init get_new_step_size(unsigned long step_size) static void __init memory_map_top_down(unsigned long map_start, unsigned long map_end) { - unsigned long real_end, start, last_start; + unsigned long real_end, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; @@ -609,7 +609,7 @@ static void __init memory_map_top_down(unsigned long map_start, step_size = PMD_SIZE; max_pfn_mapped = 0; /* will get exact value next */ min_pfn_mapped = real_end >> PAGE_SHIFT; - last_start 
= start = real_end; + last_start = real_end; /* * We start from the top (end of memory) and go to the bottom. @@ -618,6 +618,8 @@ static void __init memory_map_top_down(unsigned long map_start, * for page table. */ while (last_start > map_start) { + unsigned long start; + if (last_start > step_size) { start = round_down(last_start - 1, step_size); if (start < map_start) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7c055259de3a..da31c2635ee4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,19 +394,6 @@ repeat: return last_map_addr; } -pte_t *kmap_pte; - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* - * Cache the first kmap pte: - */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} - #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { @@ -712,8 +699,6 @@ void __init paging_init(void) __flush_tlb_all(); - kmap_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index f60398aeb644..9aaa756ddf21 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} - -/* - * Map 'pfn' using protections 'prot' - */ -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -81,36 +60,6 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); -} -EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); - -void -iounmap_atomic(void __iomem *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. 
- */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); - preempt_enable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iounmap_atomic); +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index bc0833713be9..c79e5736ab2b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -198,6 +198,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 40baa90e74f4..16f878c26667 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2194,6 +2194,7 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2225,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) arch_flush_lazy_mmu_mode(); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned int level; @@ -2238,7 +2239,6 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } -#endif /* CONFIG_HIBERNATION */ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index a2488b6e27d6..1d8391fcca68 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -49,7 +49,7 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) struct stack_frame_ia32 *head; /* User process is IA32 */ - if (!current || !test_thread_flag(TIF_IA32)) + if (!current || user_64bit_mode(regs)) return 0; head = (struct stack_frame_ia32 *) regs->bp; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index fa855bbaebaf..f2f4a5d50b27 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -366,9 +366,9 @@ static int __init pcibios_assign_resources(void) return 0; } -/** - * called in fs_initcall (one below subsys_initcall), - * give a chance for motherboard 
reserve resources +/* + * This is an fs_initcall (one below subsys_initcall) in order to reserve + * resources properly. */ fs_initcall(pcibios_assign_resources); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 24ca4ee2802f..95e2e6bd8d8c 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { struct irq_alloc_info info; - int polarity; + bool polarity_low; int ret; u8 gsi; @@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: - polarity = IOAPIC_POL_HIGH; + polarity_low = false; /* Special treatment for IRQ0 */ if (gsi == 0) { @@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = IOAPIC_POL_LOW; + polarity_low = true; break; } - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 6fa42e9c4e6f..234998f196d4 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -425,7 +425,7 @@ static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used) { struct resource mcfg_res; @@ -442,7 +442,7 @@ static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used) return mcfg_res.flags; } -typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type); +typedef bool (*check_reserved_t)(u64 start, u64 end, enum e820_type type); static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, struct pci_mmcfg_region *cfg, diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5701d5ba3df4..7d2525691854 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -11,7 +11,8 @@ #include <linux/pci_ids.h> #include <linux/export.h> #include <linux/list.h> -#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> +#include <linux/swiotlb.h> #include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index c552cd2d0632..3d41a09c2c14 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -152,7 +152,6 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, #if defined(CONFIG_PCI_MSI) #include <linux/msi.h> -#include <asm/msidef.h> struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); @@ -210,23 +209,20 @@ free: return ret; } -#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ - MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) - static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, struct msi_msg *msg) { - /* We set vector == 0 to tell the hypervisor we don't care about it, - * but we want a pirq setup instead. - * We use the dest_id field to pass the pirq that we want. 
*/ - msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq); - msg->address_lo = - MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_MODE_PHYSICAL | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(pirq); - - msg->data = XEN_PIRQ_MSI_DATA; + /* + * We set vector == 0 to tell the hypervisor we don't care about + * it, but we want a pirq setup instead. We use the dest_id fields + * to pass the pirq that we want. + */ + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_hi.destid_8_31 = pirq >> 8; + msg->arch_addr_lo.destid_0_7 = pirq & 0xFF; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_EXTINT; } static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile index 224ff0504890..1441dda8edf7 100644 --- a/arch/x86/platform/uv/Makefile +++ b/arch/x86/platform/uv/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o +obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_time.o uv_nmi.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 54511eaccf4d..bf31af3d32d6 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -72,6 +72,7 @@ static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); long system_serial_number; @@ -171,6 +172,60 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) (u64)decode, (u64)domain, (u64)bus, 0, 0); } +extern s64 uv_bios_get_master_nasid(u64 size, u64 *master_nasid) +{ + return uv_bios_call(UV_BIOS_EXTRA, 0, UV_BIOS_EXTRA_MASTER_NASID, 0, + size, (u64)master_nasid); +} +EXPORT_SYMBOL_GPL(uv_bios_get_master_nasid); + +extern s64 uv_bios_get_heapsize(u64 nasid, u64 size, u64 *heap_size) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_GET_HEAPSIZE, + 0, size, (u64)heap_size); +} +EXPORT_SYMBOL_GPL(uv_bios_get_heapsize); + +extern s64 uv_bios_install_heap(u64 nasid, u64 heap_size, u64 *bios_heap) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_INSTALL_HEAP, + 0, heap_size, (u64)bios_heap); +} +EXPORT_SYMBOL_GPL(uv_bios_install_heap); + +extern s64 uv_bios_obj_count(u64 nasid, u64 size, u64 *objcnt) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_OBJECT_COUNT, + 0, size, (u64)objcnt); +} +EXPORT_SYMBOL_GPL(uv_bios_obj_count); + +extern s64 uv_bios_enum_objs(u64 nasid, u64 size, u64 *objbuf) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_OBJECTS, + 0, size, (u64)objbuf); +} +EXPORT_SYMBOL_GPL(uv_bios_enum_objs); + +extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 size, u64 *portbuf) +{ + return uv_bios_call(UV_BIOS_EXTRA, nasid, UV_BIOS_EXTRA_ENUM_PORTS, + obj_id, size, (u64)portbuf); +} +EXPORT_SYMBOL_GPL(uv_bios_enum_ports); + +extern s64 uv_bios_get_geoinfo(u64 nasid, u64 size, u64 *buf) +{ + return uv_bios_call(UV_BIOS_GET_GEOINFO, nasid, (u64)buf, size, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_get_geoinfo); + +extern s64 uv_bios_get_pci_topology(u64 size, u64 *buf) +{ + return uv_bios_call(UV_BIOS_GET_PCI_TOPOLOGY, (u64)buf, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_get_pci_topology); + unsigned long get_uv_systab_phys(bool msg) { if ((uv_systab_phys == 
EFI_INVALID_TABLE_ADDR) || diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 18ca2261cc9a..1a536a187d74 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -35,8 +35,8 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; + entry->delivery_mode = apic->delivery_mode; + entry->dest_mode = apic->dest_mode_logical; entry->polarity = 0; entry->trigger = 0; entry->mask = 0; diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c deleted file mode 100644 index 266773e2fb37..000000000000 --- a/arch/x86/platform/uv/uv_sysfs.c +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson - */ - -#include <linux/device.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> - -struct kobject *sgi_uv_kobj; - -static ssize_t partition_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); -} - -static ssize_t coherence_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id); -} - -static struct kobj_attribute partition_id_attr = - __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); - -static struct kobj_attribute coherence_id_attr = - __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); - - -static int __init sgi_uv_sysfs_init(void) -{ - unsigned long ret; - - if (!is_uv_system()) - return -ENODEV; - - if (!sgi_uv_kobj) - sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); - if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); - return -EINVAL; - } - - ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); - return ret; - } - - ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); - return ret; - } - - return 0; -} - -device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 7b37a412f829..f03b64d9cb51 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include <linux/bug.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <asm/purgatory.h> #include "../boot/string.h" diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 218acbd5c7a0..afc1da68b06d 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,6 +26,19 @@ config XEN_PV help Support running as a Xen PV guest. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN_PV + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". 
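The help text above names "xen_512gb_limit" as the boot-time override for the CONFIG_XEN_512GB default. For reference, a hedged sketch of the usual early_param() idiom for such a switch; this illustrates the pattern only and is not the actual Xen parser, whose details may differ.

/*
 * Hedged sketch: a boolean command-line override of a Kconfig default.
 * The variable name and semantics are assumptions for illustration.
 */
#include <linux/init.h>
#include <linux/kernel.h>

static bool xen_512gb_limit = IS_ENABLED(CONFIG_XEN_512GB);

static int __init xen_parse_512gb(char *arg)
{
	bool val = false;

	if (!arg)			/* bare "xen_512gb_limit" enables the limit */
		val = true;
	else if (kstrtobool(arg, &val))
		return 1;

	xen_512gb_limit = val;
	return 0;
}
early_param("xen_512gb_limit", xen_parse_512gb);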
+ config XEN_PV_SMP def_bool y depends on XEN_PV && SMP @@ -39,28 +52,19 @@ config XEN_DOM0 Support running as a Xen PV Dom0 guest. config XEN_PVHVM - bool "Xen PVHVM guest support" - default y - depends on XEN && PCI && X86_LOCAL_APIC - help - Support running as a Xen PVHVM guest. + def_bool y + depends on XEN && X86_LOCAL_APIC config XEN_PVHVM_SMP def_bool y depends on XEN_PVHVM && SMP -config XEN_512GB - bool "Limit Xen pv-domain memory to 512GB" - depends on XEN_PV +config XEN_PVHVM_GUEST + bool "Xen PVHVM guest support" default y + depends on XEN_PVHVM && PCI help - Limit paravirtualized user domains to 512GB of RAM. - - The Xen tools and crash dump analysis tools might not support - pv-domains with more than 512 GB of RAM. This option controls the - default setting of the kernel to use only up to 512 GB or more. - It is always possible to change the default via specifying the - boot parameter "xen_512gb_limit". + Support running as a Xen PVHVM guest. config XEN_SAVE_RESTORE bool @@ -76,7 +80,9 @@ config XEN_DEBUG_FS Enabling this option may incur a significant performance overhead. config XEN_PVH - bool "Support for running as a Xen PVH guest" + bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH def_bool n + help + Support for running as a Xen PVH guest. diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index e82fd1910dae..0d46cc283cf5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -148,15 +148,12 @@ static struct apic xen_pv_apic = { .apic_id_valid = xen_id_always_valid, .apic_id_registered = xen_id_always_registered, - /* .irq_delivery_mode - used in native_compose_msi_msg only */ - /* .irq_dest_mode - used in native_compose_msi_msg only */ + /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - /* .dest_logical - default_send_IPI_ use it but we use our own. */ - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 205a9bc981b0..7d7ffb9c826a 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -93,37 +93,22 @@ static efi_system_table_t __init *xen_efi_probe(void) /* * Determine whether we're in secure boot mode. - * - * Please keep the logic in sync with - * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot(). 
*/ static enum efi_secureboot_mode xen_efi_get_secureboot(void) { - static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; + enum efi_secureboot_mode mode; efi_status_t status; - u8 moksbstate, secboot, setupmode; + u8 moksbstate; unsigned long size; - size = sizeof(secboot); - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_unknown) { + pr_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* See if a user has put the shim into insecure mode. */ size = sizeof(moksbstate); @@ -140,10 +125,6 @@ static enum efi_secureboot_mode xen_efi_get_secureboot(void) secure_boot_enabled: pr_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - - out_efi_err: - pr_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } void __init xen_efi_init(struct boot_params *boot_params) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index be4151f42611..3301875dd196 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -795,17 +795,7 @@ static int p2m_dump_show(struct seq_file *m, void *v) return 0; } -static int p2m_dump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, p2m_dump_show, NULL); -} - -static const struct file_operations p2m_dump_fops = { - .open = p2m_dump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(p2m_dump); static struct dentry *d_mmu_debug;
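The p2m.c hunk above is a common cleanup: a hand-rolled single_open()-based file_operations is replaced by DEFINE_SHOW_ATTRIBUTE(), which generates the matching _open handler and _fops structure from an existing _show callback. A hedged sketch of the idiom in isolation; the file name, output, and helper below are illustrative only, not taken from the Xen code.

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int example_dump_show(struct seq_file *m, void *v)
{
	seq_puts(m, "example state\n");
	return 0;
}
/* Expands to example_dump_open() and example_dump_fops via single_open(). */
DEFINE_SHOW_ATTRIBUTE(example_dump);

static void example_debugfs_init(struct dentry *parent)
{
	/* example_dump_fops is the structure generated by the macro above. */
	debugfs_create_file("example_dump", 0444, parent, NULL,
			    &example_dump_fops);
}

The naming contract is the only requirement: DEFINE_SHOW_ATTRIBUTE(name) expects a name##_show() seq_file callback and emits name##_fops, which is why the patch can drop p2m_dump_open() and the explicit file_operations without changing behaviour.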