Diffstat (limited to 'arch')
476 files changed, 13608 insertions, 4920 deletions
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 97fce7386b00..780d4673c3ca 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -3,6 +3,7 @@ config ALPHA bool default y select ARCH_32BIT_USTAT_F_TINODE + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_NO_PREEMPT diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index b4faba2432d5..842e85776cc0 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -18,7 +18,7 @@ #include <asm/hwrpb.h> #include <asm/io.h> -#include <stdarg.h> +#include <linux/stdarg.h> #include "ksize.h" diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 90a2b341e9c0..c6079308eab3 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -20,7 +20,7 @@ #include <asm/hwrpb.h> #include <asm/io.h> -#include <stdarg.h> +#include <linux/stdarg.h> #include "kzsize.h" diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index e5347a080008..22a1cb0264af 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -15,7 +15,7 @@ #include <asm/console.h> #include <asm/hwrpb.h> -#include <stdarg.h> +#include <linux/stdarg.h> #include "ksize.h" diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c index 325d4dd4f904..1ab91852d9f7 100644 --- a/arch/alpha/boot/misc.c +++ b/arch/alpha/boot/misc.c @@ -89,8 +89,6 @@ static ulg output_ptr; static ulg bytes_out; static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); extern int end; static ulg free_mem_ptr; diff --git a/arch/alpha/boot/stdio.c b/arch/alpha/boot/stdio.c index 60f73ccd2e89..faa5234b90b8 100644 --- a/arch/alpha/boot/stdio.c +++ b/arch/alpha/boot/stdio.c @@ -2,8 +2,8 @@ /* * Copyright (C) Paul Mackerras 1997. */ -#include <stdarg.h> -#include <stddef.h> +#include <linux/string.h> +#include <linux/stdarg.h> size_t strnlen(const char * s, size_t count) { @@ -42,8 +42,8 @@ static int skip_atoi(const char **s) static char * number(char * str, unsigned long long num, int base, int size, int precision, int type) { - char c,sign,tmp[66]; - const char *digits="0123456789abcdefghijklmnopqrstuvwxyz"; + char c, sign, tmp[66]; + const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; int i; if (type & LARGE) @@ -83,14 +83,14 @@ static char * number(char * str, unsigned long long num, int base, int size, int precision = i; size -= precision; if (!(type&(ZEROPAD+LEFT))) - while(size-->0) + while (size-- > 0) *str++ = ' '; if (sign) *str++ = sign; if (type & SPECIAL) { if (base==8) *str++ = '0'; - else if (base==16) { + else if (base == 16) { *str++ = '0'; *str++ = digits[33]; } @@ -125,7 +125,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) /* 'z' changed to 'Z' --davidm 1/25/99 */ - for (str=buf ; *fmt ; ++fmt) { + for (str = buf ; *fmt ; ++fmt) { if (*fmt != '%') { *str++ = *fmt; continue; @@ -296,7 +296,7 @@ int sprintf(char * buf, const char *fmt, ...) 
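
The objstrip.c hunk above swaps a hand-rolled magic test for the canonical one: compare the first SELFMAG (4) bytes of e_ident against ELFMAG ("\177ELF"). A minimal stand-alone sketch of the same check, built only from the standard <elf.h> constants; the helper name is illustrative, not part of the patch:

	#include <elf.h>
	#include <string.h>

	/* Return nonzero iff 'eh' carries the ELF magic and names an
	 * ET_EXEC (static executable) image, the two things objstrip
	 * insists on before it will strip the object. */
	static int is_elf_exec(const Elf64_Ehdr *eh)
	{
		if (memcmp(&eh->e_ident[EI_MAG0], ELFMAG, SELFMAG) != 0)
			return 0;		/* not an ELF object at all */
		return eh->e_type == ET_EXEC;
	}
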
int i; va_start(args, fmt); - i=vsprintf(buf,fmt,args); + i = vsprintf(buf, fmt, args); va_end(args); return i; } diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c index 08b430d25a31..7cf92d172dce 100644 --- a/arch/alpha/boot/tools/objstrip.c +++ b/arch/alpha/boot/tools/objstrip.c @@ -148,7 +148,7 @@ main (int argc, char *argv[]) #ifdef __ELF__ elf = (struct elfhdr *) buf; - if (elf->e_ident[0] == 0x7f && str_has_prefix((char *)elf->e_ident + 1, "ELF")) { + if (memcmp(&elf->e_ident[EI_MAG0], ELFMAG, SELFMAG) == 0) { if (elf->e_type != ET_EXEC) { fprintf(stderr, "%s: %s is not an ELF executable\n", prog_name, inname); diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 6a39fe8ce9e5..1816c1dc22b1 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -39,14 +39,12 @@ CONFIG_PATA_CYPRESS=y CONFIG_ATA_GENERIC=y CONFIG_NETDEVICES=y CONFIG_DUMMY=m -CONFIG_NET_ETHERNET=y CONFIG_NET_VENDOR_3COM=y CONFIG_VORTEX=y CONFIG_NET_TULIP=y CONFIG_DE2104X=m CONFIG_TULIP=y CONFIG_TULIP_MMIO=y -CONFIG_NET_PCI=y CONFIG_YELLOWFIN=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 54f5126628c6..dd31e97edae8 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -2,6 +2,7 @@ generated-y += syscall_table.h generic-y += agp.h +generic-y += asm-offsets.h generic-y += export.h generic-y += kvm_para.h generic-y += mcs_spinlock.h diff --git a/arch/alpha/include/asm/asm-offsets.h b/arch/alpha/include/asm/asm-offsets.h deleted file mode 100644 index d370ee36a182..000000000000 --- a/arch/alpha/include/asm/asm-offsets.h +++ /dev/null @@ -1 +0,0 @@ -#include <generated/asm-offsets.h> diff --git a/arch/alpha/include/asm/div64.h b/arch/alpha/include/asm/div64.h deleted file mode 100644 index 6cd978cefb28..000000000000 --- a/arch/alpha/include/asm/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/div64.h> diff --git a/arch/alpha/include/asm/fpu.h b/arch/alpha/include/asm/fpu.h index b9691405e56b..30b24135dd7a 100644 --- a/arch/alpha/include/asm/fpu.h +++ b/arch/alpha/include/asm/fpu.h @@ -15,21 +15,27 @@ rdfpcr(void) { unsigned long tmp, ret; + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) { + ret = current_thread_info()->fp[31]; + } else { #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) - __asm__ __volatile__ ( - "ftoit $f0,%0\n\t" - "mf_fpcr $f0\n\t" - "ftoit $f0,%1\n\t" - "itoft %0,$f0" - : "=r"(tmp), "=r"(ret)); + __asm__ __volatile__ ( + "ftoit $f0,%0\n\t" + "mf_fpcr $f0\n\t" + "ftoit $f0,%1\n\t" + "itoft %0,$f0" + : "=r"(tmp), "=r"(ret)); #else - __asm__ __volatile__ ( - "stt $f0,%0\n\t" - "mf_fpcr $f0\n\t" - "stt $f0,%1\n\t" - "ldt $f0,%0" - : "=m"(tmp), "=m"(ret)); + __asm__ __volatile__ ( + "stt $f0,%0\n\t" + "mf_fpcr $f0\n\t" + "stt $f0,%1\n\t" + "ldt $f0,%0" + : "=m"(tmp), "=m"(ret)); #endif + } + preempt_enable(); return ret; } @@ -39,21 +45,28 @@ wrfpcr(unsigned long val) { unsigned long tmp; + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) { + current_thread_info()->status |= TS_RESTORE_FP; + current_thread_info()->fp[31] = val; + } else { #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) - __asm__ __volatile__ ( - "ftoit $f0,%0\n\t" - "itoft %1,$f0\n\t" - "mt_fpcr $f0\n\t" - "itoft %0,$f0" - : "=&r"(tmp) : "r"(val)); + __asm__ __volatile__ ( + "ftoit $f0,%0\n\t" + "itoft %1,$f0\n\t" + "mt_fpcr $f0\n\t" + "itoft %0,$f0" + : "=&r"(tmp) : "r"(val)); 
#else - __asm__ __volatile__ ( - "stt $f0,%0\n\t" - "ldt $f0,%1\n\t" - "mt_fpcr $f0\n\t" - "ldt $f0,%0" - : "=m"(tmp) : "m"(val)); + __asm__ __volatile__ ( + "stt $f0,%0\n\t" + "ldt $f0,%1\n\t" + "mt_fpcr $f0\n\t" + "ldt $f0,%0" + : "=m"(tmp) : "m"(val)); #endif + } + preempt_enable(); } static inline unsigned long diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 1c3605d874e9..7aeaf7c30a6f 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -14,10 +14,6 @@ the implementation we have here matches that interface. */ #include <asm-generic/iomap.h> -/* We don't use IO slowdowns on the Alpha, but.. */ -#define __SLOW_DOWN_IO do { } while (0) -#define SLOW_DOWN_IO do { } while (0) - /* * Virtual -> physical identity mapping starts at this offset */ diff --git a/arch/alpha/include/asm/irq_regs.h b/arch/alpha/include/asm/irq_regs.h deleted file mode 100644 index 3dd9c0b70270..000000000000 --- a/arch/alpha/include/asm/irq_regs.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/irq_regs.h> diff --git a/arch/alpha/include/asm/kdebug.h b/arch/alpha/include/asm/kdebug.h deleted file mode 100644 index 6ece1b037665..000000000000 --- a/arch/alpha/include/asm/kdebug.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/kdebug.h> diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 082631465074..4a4d00b37986 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -26,6 +26,7 @@ struct thread_info { int bpt_nsaved; unsigned long bpt_addr[2]; /* breakpoint handling */ unsigned int bpt_insn[2]; + unsigned long fp[32]; }; /* @@ -41,6 +42,8 @@ struct thread_info { register struct thread_info *__current_thread_info __asm__("$8"); #define current_thread_info() __current_thread_info +register unsigned long *current_stack_pointer __asm__ ("$30"); + #endif /* __ASSEMBLY__ */ /* Thread information allocation. */ @@ -81,6 +84,9 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TS_UAC_NOFIX 0x0002 /* ! flags as they match */ #define TS_UAC_SIGBUS 0x0004 /* ! 
userspace part of 'osf_sysinfo' */ +#define TS_SAVED_FP 0x0008 +#define TS_RESTORE_FP 0x0010 + #define SET_UNALIGN_CTL(task,value) ({ \ __u32 status = task_thread_info(task)->status & ~UAC_BITMASK; \ if (value & PR_UNALIGN_NOPRINT) \ @@ -104,5 +110,17 @@ register struct thread_info *__current_thread_info __asm__("$8"); put_user(res, (int __user *)(value)); \ }) +#ifndef __ASSEMBLY__ +extern void __save_fpu(void); + +static inline void save_fpu(void) +{ + if (!(current_thread_info()->status & TS_SAVED_FP)) { + current_thread_info()->status |= TS_SAVED_FP; + __save_fpu(); + } +} +#endif + #endif /* __KERNEL__ */ #endif /* _ALPHA_THREAD_INFO_H */ diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index 986f5da9b7d8..caabd92ea709 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -4,7 +4,7 @@ #include <uapi/asm/unistd.h> -#define NR_SYSCALLS __NR_syscalls +#define NR_syscalls __NR_syscalls #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_OLD_READDIR diff --git a/arch/alpha/include/uapi/asm/ptrace.h b/arch/alpha/include/uapi/asm/ptrace.h index c29194181025..5ca45934fcbb 100644 --- a/arch/alpha/include/uapi/asm/ptrace.h +++ b/arch/alpha/include/uapi/asm/ptrace.h @@ -64,7 +64,9 @@ struct switch_stack { unsigned long r14; unsigned long r15; unsigned long r26; +#ifndef __KERNEL__ unsigned long fp[32]; /* fp[31] is fpcr */ +#endif }; diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 2e125e5c1508..b121294bee26 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -17,6 +17,8 @@ void foo(void) DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TI_FP, offsetof(struct thread_info, fp)); + DEFINE(TI_STATUS, offsetof(struct thread_info, status)); BLANK(); DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked)); diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index f489170201c3..12926e9538b8 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -527,7 +527,7 @@ verify_tb_operation(void) if (use_tbia_try2) { alpha_mv.mv_pci_tbi = cia_pci_tbi_try2; - /* Tags 0-3 must be disabled if we use this workaraund. */ + /* Tags 0-3 must be disabled if we use this workaround. */ wmb(); *(vip)CIA_IOC_TB_TAGn(0) = 2; *(vip)CIA_IOC_TB_TAGn(1) = 2; diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index a6207c47f089..eb51f93a70c8 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -17,7 +17,7 @@ /* Stack offsets. */ #define SP_OFF 184 -#define SWITCH_STACK_SIZE 320 +#define SWITCH_STACK_SIZE 64 .macro CFI_START_OSF_FRAME func .align 4 @@ -159,7 +159,6 @@ .cfi_rel_offset $13, 32 .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 - /* We don't really care about the FP registers for debugging. 
*/ .endm .macro UNDO_SWITCH_STACK @@ -454,7 +453,7 @@ entSys: SAVE_ALL lda $8, 0x3fff bic $sp, $8, $8 - lda $4, NR_SYSCALLS($31) + lda $4, NR_syscalls($31) stq $16, SP_OFF+24($sp) lda $5, sys_call_table lda $27, sys_ni_syscall @@ -498,6 +497,10 @@ ret_to_user: and $17, _TIF_WORK_MASK, $2 bne $2, work_pending restore_all: + ldl $2, TI_STATUS($8) + and $2, TS_SAVED_FP | TS_RESTORE_FP, $3 + bne $3, restore_fpu +restore_other: .cfi_remember_state RESTORE_ALL call_pal PAL_rti @@ -506,7 +509,7 @@ ret_to_kernel: .cfi_restore_state lda $16, 7 call_pal PAL_swpipl - br restore_all + br restore_other .align 3 $syscall_error: @@ -570,6 +573,14 @@ $work_notifysig: .type strace, @function strace: /* set up signal stack, call syscall_trace */ + // NB: if anyone adds preemption, this block will need to be protected + ldl $1, TI_STATUS($8) + and $1, TS_SAVED_FP, $3 + or $1, TS_SAVED_FP, $2 + bne $3, 1f + stl $2, TI_STATUS($8) + bsr $26, __save_fpu +1: DO_SWITCH_STACK jsr $26, syscall_trace_enter /* returns the syscall number */ UNDO_SWITCH_STACK @@ -583,7 +594,7 @@ strace: ldq $21, 88($sp) /* get the system call pointer.. */ - lda $1, NR_SYSCALLS($31) + lda $1, NR_syscalls($31) lda $2, sys_call_table lda $27, sys_ni_syscall cmpult $0, $1, $1 @@ -649,40 +660,6 @@ do_switch_stack: stq $14, 40($sp) stq $15, 48($sp) stq $26, 56($sp) - stt $f0, 64($sp) - stt $f1, 72($sp) - stt $f2, 80($sp) - stt $f3, 88($sp) - stt $f4, 96($sp) - stt $f5, 104($sp) - stt $f6, 112($sp) - stt $f7, 120($sp) - stt $f8, 128($sp) - stt $f9, 136($sp) - stt $f10, 144($sp) - stt $f11, 152($sp) - stt $f12, 160($sp) - stt $f13, 168($sp) - stt $f14, 176($sp) - stt $f15, 184($sp) - stt $f16, 192($sp) - stt $f17, 200($sp) - stt $f18, 208($sp) - stt $f19, 216($sp) - stt $f20, 224($sp) - stt $f21, 232($sp) - stt $f22, 240($sp) - stt $f23, 248($sp) - stt $f24, 256($sp) - stt $f25, 264($sp) - stt $f26, 272($sp) - stt $f27, 280($sp) - mf_fpcr $f0 # get fpcr - stt $f28, 288($sp) - stt $f29, 296($sp) - stt $f30, 304($sp) - stt $f0, 312($sp) # save fpcr in slot of $f31 - ldt $f0, 64($sp) # dont let "do_switch_stack" change fp state. 
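
The two register walls deleted here are the heart of the series: FP state leaves the switch_stack frame and moves into thread_info.fp[], with the FPCR parked in fp[31]. __save_fpu() captures it at most once per trip into the kernel, and the restore_fpu path reloads it on return to user space only when TS_RESTORE_FP says the saved image was modified. A rough C sketch of that protocol, where hw_save_all_fp() and hw_load_all_fp() are hypothetical stand-ins for the assembly helpers:

	#define TS_SAVED_FP	0x0008	/* ti->fp[] holds the task's FP state */
	#define TS_RESTORE_FP	0x0010	/* ti->fp[] was changed; reload it */

	struct alpha_ti {		/* illustrative subset of thread_info */
		unsigned int status;
		unsigned long fp[32];	/* fp[31] is the FPCR */
	};

	extern void hw_save_all_fp(unsigned long *fp);	     /* ~ __save_fpu */
	extern void hw_load_all_fp(const unsigned long *fp); /* ~ restore_fpu */

	static void save_fpu_once(struct alpha_ti *ti)
	{
		if (!(ti->status & TS_SAVED_FP)) {
			ti->status |= TS_SAVED_FP;
			hw_save_all_fp(ti->fp);
		}
	}

	static void fixup_before_user(struct alpha_ti *ti)
	{
		if (ti->status & TS_RESTORE_FP)
			hw_load_all_fp(ti->fp);	/* image was modified */
		ti->status &= ~(TS_SAVED_FP | TS_RESTORE_FP);
	}
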
ret $31, ($1), 1 .cfi_endproc .size do_switch_stack, .-do_switch_stack @@ -701,54 +678,71 @@ undo_switch_stack: ldq $14, 40($sp) ldq $15, 48($sp) ldq $26, 56($sp) - ldt $f30, 312($sp) # get saved fpcr - ldt $f0, 64($sp) - ldt $f1, 72($sp) - ldt $f2, 80($sp) - ldt $f3, 88($sp) - mt_fpcr $f30 # install saved fpcr - ldt $f4, 96($sp) - ldt $f5, 104($sp) - ldt $f6, 112($sp) - ldt $f7, 120($sp) - ldt $f8, 128($sp) - ldt $f9, 136($sp) - ldt $f10, 144($sp) - ldt $f11, 152($sp) - ldt $f12, 160($sp) - ldt $f13, 168($sp) - ldt $f14, 176($sp) - ldt $f15, 184($sp) - ldt $f16, 192($sp) - ldt $f17, 200($sp) - ldt $f18, 208($sp) - ldt $f19, 216($sp) - ldt $f20, 224($sp) - ldt $f21, 232($sp) - ldt $f22, 240($sp) - ldt $f23, 248($sp) - ldt $f24, 256($sp) - ldt $f25, 264($sp) - ldt $f26, 272($sp) - ldt $f27, 280($sp) - ldt $f28, 288($sp) - ldt $f29, 296($sp) - ldt $f30, 304($sp) lda $sp, SWITCH_STACK_SIZE($sp) ret $31, ($1), 1 .cfi_endproc .size undo_switch_stack, .-undo_switch_stack + +#define FR(n) n * 8 + TI_FP($8) + .align 4 + .globl __save_fpu + .type __save_fpu, @function +__save_fpu: +#define V(n) stt $f##n, FR(n) + V( 0); V( 1); V( 2); V( 3) + V( 4); V( 5); V( 6); V( 7) + V( 8); V( 9); V(10); V(11) + V(12); V(13); V(14); V(15) + V(16); V(17); V(18); V(19) + V(20); V(21); V(22); V(23) + V(24); V(25); V(26); V(27) + mf_fpcr $f0 # get fpcr + V(28); V(29); V(30) + stt $f0, FR(31) # save fpcr in slot of $f31 + ldt $f0, FR(0) # don't let "__save_fpu" change fp state. + ret +#undef V + .size __save_fpu, .-__save_fpu + + .align 4 +restore_fpu: + and $3, TS_RESTORE_FP, $3 + bic $2, TS_SAVED_FP | TS_RESTORE_FP, $2 + beq $3, 1f +#define V(n) ldt $f##n, FR(n) + ldt $f30, FR(31) # get saved fpcr + V( 0); V( 1); V( 2); V( 3) + mt_fpcr $f30 # install saved fpcr + V( 4); V( 5); V( 6); V( 7) + V( 8); V( 9); V(10); V(11) + V(12); V(13); V(14); V(15) + V(16); V(17); V(18); V(19) + V(20); V(21); V(22); V(23) + V(24); V(25); V(26); V(27) + V(28); V(29); V(30) +1: stl $2, TI_STATUS($8) + br restore_other +#undef V + /* * The meat of the context switch code. */ - .align 4 .globl alpha_switch_to .type alpha_switch_to, @function .cfi_startproc alpha_switch_to: DO_SWITCH_STACK + ldl $1, TI_STATUS($8) + and $1, TS_RESTORE_FP, $3 + bne $3, 1f + or $1, TS_RESTORE_FP | TS_SAVED_FP, $2 + and $1, TS_SAVED_FP, $3 + stl $2, TI_STATUS($8) + bne $3, 1f + bsr $26, __save_fpu +1: call_pal PAL_swpctx lda $8, 0x3fff UNDO_SWITCH_STACK @@ -799,6 +793,14 @@ ret_from_kernel_thread: alpha_\name: .prologue 0 bsr $1, do_switch_stack + // NB: if anyone adds preemption, this block will need to be protected + ldl $1, TI_STATUS($8) + and $1, TS_SAVED_FP, $3 + or $1, TS_SAVED_FP, $2 + bne $3, 1f + stl $2, TI_STATUS($8) + bsr $26, __save_fpu +1: jsr $26, sys_\name ldq $26, 56($sp) lda $sp, SWITCH_STACK_SIZE($sp) diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c index 5b60c248de9e..cbefa5a77384 100644 --- a/arch/alpha/kernel/module.c +++ b/arch/alpha/kernel/module.c @@ -146,10 +146,8 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, base = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr; symtab = (Elf64_Sym *)sechdrs[symindex].sh_addr; - /* The small sections were sorted to the end of the segment. - The following should definitely cover them. 
*/ - gp = (u64)me->core_layout.base + me->core_layout.size - 0x8000; got = sechdrs[me->arch.gotsecindex].sh_addr; + gp = got + 0x8000; for (i = 0; i < n; i++) { unsigned long r_sym = ELF64_R_SYM (rela[i].r_info); diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index c54469b369cb..2a9a877a0508 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -522,7 +522,7 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path, break; default: retval = -EINVAL; - printk("osf_mount(%ld, %x)\n", typenr, flag); + printk_ratelimited("osf_mount(%ld, %x)\n", typenr, flag); } return retval; diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index e83a02ed5267..c81183935e97 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -127,10 +127,12 @@ again: goto again; } - if (ptes[p+i]) - p = ALIGN(p + i + 1, mask + 1), i = 0; - else + if (ptes[p+i]) { + p = ALIGN(p + i + 1, mask + 1); + i = 0; + } else { i = i + 1; + } } if (i < n) { diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index efcf7321701b..ccdb508c1516 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -689,8 +689,6 @@ static int __hw_perf_event_init(struct perf_event *event) */ static int alpha_pmu_event_init(struct perf_event *event) { - int err; - /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; @@ -709,9 +707,7 @@ static int alpha_pmu_event_init(struct perf_event *event) return -ENODEV; /* Do the real initialisation work. */ - err = __hw_perf_event_init(event); - - return err; + return __hw_perf_event_init(event); } /* diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 0eddd22c6212..e9cf7193eb81 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -133,7 +133,7 @@ common_shutdown_1(void *generic_ptr) #ifdef CONFIG_DUMMY_CONSOLE /* If we've gotten here after SysRq-b, leave interrupt context before taking over the console. */ - if (in_irq()) + if (in_hardirq()) irq_exit(); /* This has the effect of resetting the VGA video origin. 
*/ console_lock(); @@ -243,6 +243,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childstack = ((struct switch_stack *) childregs) - 1; childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ + childti->status |= TS_SAVED_FP | TS_RESTORE_FP; if (unlikely(args->fn)) { /* kernel thread */ @@ -252,6 +253,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childstack->r9 = (unsigned long) args->fn; childstack->r10 = (unsigned long) args->fn_arg; childregs->hae = alpha_mv.hae_cache; + memset(childti->fp, '\0', sizeof(childti->fp)); childti->pcb.usp = 0; return 0; } @@ -334,8 +336,7 @@ EXPORT_SYMBOL(dump_elf_task); int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) { - struct switch_stack *sw = (struct switch_stack *)task_pt_regs(t) - 1; - memcpy(fpu, sw->fp, 32 * 8); + memcpy(fpu, task_thread_info(t)->fp, 32 * 8); return 1; } diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index a1a239ea002d..fde4c68e7a0b 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -78,6 +78,8 @@ enum { (PAGE_SIZE*2 - sizeof(struct pt_regs) - sizeof(struct switch_stack) \ + offsetof(struct switch_stack, reg)) +#define FP_REG(reg) (offsetof(struct thread_info, reg)) + static int regoff[] = { PT_REG( r0), PT_REG( r1), PT_REG( r2), PT_REG( r3), PT_REG( r4), PT_REG( r5), PT_REG( r6), PT_REG( r7), @@ -87,14 +89,14 @@ static int regoff[] = { PT_REG( r20), PT_REG( r21), PT_REG( r22), PT_REG( r23), PT_REG( r24), PT_REG( r25), PT_REG( r26), PT_REG( r27), PT_REG( r28), PT_REG( gp), -1, -1, - SW_REG(fp[ 0]), SW_REG(fp[ 1]), SW_REG(fp[ 2]), SW_REG(fp[ 3]), - SW_REG(fp[ 4]), SW_REG(fp[ 5]), SW_REG(fp[ 6]), SW_REG(fp[ 7]), - SW_REG(fp[ 8]), SW_REG(fp[ 9]), SW_REG(fp[10]), SW_REG(fp[11]), - SW_REG(fp[12]), SW_REG(fp[13]), SW_REG(fp[14]), SW_REG(fp[15]), - SW_REG(fp[16]), SW_REG(fp[17]), SW_REG(fp[18]), SW_REG(fp[19]), - SW_REG(fp[20]), SW_REG(fp[21]), SW_REG(fp[22]), SW_REG(fp[23]), - SW_REG(fp[24]), SW_REG(fp[25]), SW_REG(fp[26]), SW_REG(fp[27]), - SW_REG(fp[28]), SW_REG(fp[29]), SW_REG(fp[30]), SW_REG(fp[31]), + FP_REG(fp[ 0]), FP_REG(fp[ 1]), FP_REG(fp[ 2]), FP_REG(fp[ 3]), + FP_REG(fp[ 4]), FP_REG(fp[ 5]), FP_REG(fp[ 6]), FP_REG(fp[ 7]), + FP_REG(fp[ 8]), FP_REG(fp[ 9]), FP_REG(fp[10]), FP_REG(fp[11]), + FP_REG(fp[12]), FP_REG(fp[13]), FP_REG(fp[14]), FP_REG(fp[15]), + FP_REG(fp[16]), FP_REG(fp[17]), FP_REG(fp[18]), FP_REG(fp[19]), + FP_REG(fp[20]), FP_REG(fp[21]), FP_REG(fp[22]), FP_REG(fp[23]), + FP_REG(fp[24]), FP_REG(fp[25]), FP_REG(fp[26]), FP_REG(fp[27]), + FP_REG(fp[28]), FP_REG(fp[29]), FP_REG(fp[30]), FP_REG(fp[31]), PT_REG( pc) }; diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 6f47f256fe80..e62d1d461b1f 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -150,9 +150,10 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) { unsigned long usp; struct switch_stack *sw = (struct switch_stack *)regs - 1; - long i, err = __get_user(regs->pc, &sc->sc_pc); + long err = __get_user(regs->pc, &sc->sc_pc); current->restart_block.fn = do_no_restart_syscall; + current_thread_info()->status |= TS_SAVED_FP | TS_RESTORE_FP; sw->r26 = (unsigned long) ret_from_sys_call; @@ -189,9 +190,9 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) err |= __get_user(usp, sc->sc_regs+30); wrusp(usp); - for (i = 0; i < 31; i++) - err |= __get_user(sw->fp[i], sc->sc_fpregs+i); - 
err |= __get_user(sw->fp[31], &sc->sc_fpcr); + err |= __copy_from_user(current_thread_info()->fp, + sc->sc_fpregs, 31 * 8); + err |= __get_user(current_thread_info()->fp[31], &sc->sc_fpcr); return err; } @@ -272,7 +273,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, unsigned long sp) { struct switch_stack *sw = (struct switch_stack *)regs - 1; - long i, err = 0; + long err = 0; err |= __put_user(on_sig_stack((unsigned long)sc), &sc->sc_onstack); err |= __put_user(mask, &sc->sc_mask); @@ -312,10 +313,10 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, err |= __put_user(sp, sc->sc_regs+30); err |= __put_user(0, sc->sc_regs+31); - for (i = 0; i < 31; i++) - err |= __put_user(sw->fp[i], sc->sc_fpregs+i); + err |= __copy_to_user(sc->sc_fpregs, + current_thread_info()->fp, 31 * 8); err |= __put_user(0, sc->sc_fpregs+31); - err |= __put_user(sw->fp[31], &sc->sc_fpcr); + err |= __put_user(current_thread_info()->fp[31], &sc->sc_fpcr); err |= __put_user(regs->trap_a0, &sc->sc_traparg_a0); err |= __put_user(regs->trap_a1, &sc->sc_traparg_a1); @@ -528,6 +529,9 @@ do_work_pending(struct pt_regs *regs, unsigned long thread_flags, } else { local_irq_enable(); if (thread_flags & (_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL)) { + preempt_disable(); + save_fpu(); + preempt_enable(); do_signal(regs, r0, r19); r0 = 0; } else { diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 8a66fe544c69..d9a67b370e04 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -233,7 +233,21 @@ do_entIF(unsigned long type, struct pt_regs *regs) { int signo, code; - if ((regs->ps & ~IPL_MAX) == 0) { + if (type == 3) { /* FEN fault */ + /* Irritating users can call PAL_clrfen to disable the + FPU for the process. The kernel will then trap in + do_switch_stack and undo_switch_stack when we try + to save and restore the FP registers. + + Given that GCC by default generates code that uses the + FP registers, PAL_clrfen is not useful except for DoS + attacks. So turn the bleeding FPU back on and be done + with it. */ + current_thread_info()->pcb.flags |= 1; + __reload_thread(&current_thread_info()->pcb); + return; + } + if (!user_mode(regs)) { if (type == 1) { const unsigned int *data = (const unsigned int *) regs->pc; @@ -366,20 +380,6 @@ do_entIF(unsigned long type, struct pt_regs *regs) } break; - case 3: /* FEN fault */ - /* Irritating users can call PAL_clrfen to disable the - FPU for the process. The kernel will then trap in - do_switch_stack and undo_switch_stack when we try - to save and restore the FP registers. - - Given that GCC by default generates code that uses the - FP registers, PAL_clrfen is not useful except for DoS - attacks. So turn the bleeding FPU back on and be done - with it. */ - current_thread_info()->pcb.flags |= 1; - __reload_thread(&current_thread_info()->pcb); - return; - case 5: /* illoc */ default: /* unexpected instruction-fault type */ ;
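
The fpreg.c diff below extends the same discipline to the exported FP-register accessors: once TS_SAVED_FP says the state is parked in thread_info.fp[], reads and writes must target the in-memory image rather than the live register file, and the flag has to be tested with preemption disabled so the task cannot be scheduled away (and its FP state spilled or reloaded) between the check and the access. A hedged C sketch of the read side, with hw_read_fp() as a hypothetical stand-in for the real 32-way STT() switch:

	unsigned long read_fp_reg(unsigned long reg)
	{
		unsigned long val;

		if (reg >= 32)
			return 0;
		preempt_disable();	/* keep check and access on one CPU */
		if (current_thread_info()->status & TS_SAVED_FP)
			val = current_thread_info()->fp[reg];	/* saved image */
		else
			val = hw_read_fp(reg);			/* live register */
		preempt_enable();
		return val;
	}

The write side is symmetric but also sets TS_RESTORE_FP, so a modified image is pushed back into the hardware registers before the task returns to user mode.
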
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c index 34fea465645b..612c5eca71bc 100644 --- a/arch/alpha/lib/fpreg.c +++ b/arch/alpha/lib/fpreg.c @@ -7,6 +7,8 @@ #include <linux/compiler.h> #include <linux/export.h> +#include <linux/preempt.h> +#include <asm/thread_info.h> #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val)); @@ -19,7 +21,12 @@ alpha_read_fp_reg (unsigned long reg) { unsigned long val; - switch (reg) { + if (unlikely(reg >= 32)) + return 0; + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) + val = current_thread_info()->fp[reg]; + else switch (reg) { case 0: STT( 0, val); break; case 1: STT( 1, val); break; case 2: STT( 2, val); break; @@ -52,8 +59,8 @@ alpha_read_fp_reg (unsigned long reg) case 29: STT(29, val); break; case 30: STT(30, val); break; case 31: STT(31, val); break; - default: return 0; } + preempt_enable(); return val; } EXPORT_SYMBOL(alpha_read_fp_reg); @@ -67,7 +74,14 @@ EXPORT_SYMBOL(alpha_read_fp_reg); void alpha_write_fp_reg (unsigned long reg, unsigned long val) { - switch (reg) { + if (unlikely(reg >= 32)) + return; + + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) { + current_thread_info()->status |= TS_RESTORE_FP; + current_thread_info()->fp[reg] = val; + } else switch (reg) { case 0: LDT( 0, val); break; case 1: LDT( 1, val); break; case 2: LDT( 2, val); break; @@ -101,6 +115,7 @@ alpha_write_fp_reg (unsigned long reg, unsigned long val) case 30: LDT(30, val); break; case 31: LDT(31, val); break; } + preempt_enable(); } EXPORT_SYMBOL(alpha_write_fp_reg); @@ -115,7 +130,14 @@ alpha_read_fp_reg_s (unsigned long reg) { unsigned long val; - switch (reg) { + if (unlikely(reg >= 32)) + return 0; + + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) { + LDT(0, current_thread_info()->fp[reg]); + STS(0, val); + } else switch (reg) { case 0: STS( 0, val); break; case 1: STS( 1, val); break; case 2: STS( 2, val); break; @@ -148,8 +170,8 @@ alpha_read_fp_reg_s (unsigned long reg) case 29: STS(29, val); break; case 30: STS(30, val); break; case 31: STS(31, val); break; - default: return 0; } + preempt_enable(); return val; } EXPORT_SYMBOL(alpha_read_fp_reg_s); @@ -163,7 +185,15 @@ EXPORT_SYMBOL(alpha_read_fp_reg_s); void alpha_write_fp_reg_s (unsigned long reg, unsigned long val) { - switch (reg) { + if (unlikely(reg >= 32)) + return; + + preempt_disable(); + if (current_thread_info()->status & TS_SAVED_FP) { + current_thread_info()->status |= TS_RESTORE_FP; + LDS(0, val); + STT(0, current_thread_info()->fp[reg]); + } else switch (reg) { case 0: LDS( 0, val); break; case 1: LDS( 1, val); break; case 2: LDS( 2, val); break; @@ -197,5 +227,6 @@ alpha_write_fp_reg_s (unsigned long reg, unsigned long val) case 30: LDS(30, val); break; case 31: LDS(31, val); break; } + preempt_enable(); } EXPORT_SYMBOL(alpha_write_fp_reg_s); diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c index 62454a7810e2..2b1176dd5174 100644 --- a/arch/alpha/lib/stacktrace.c +++ b/arch/alpha/lib/stacktrace.c @@ -92,7 +92,7 @@ stacktrace(void) { instr * ret_pc; instr * prologue = (instr *)stacktrace; - register unsigned char * sp __asm__ ("$30"); + unsigned char *sp = (unsigned char *)current_stack_pointer; printk("\tstack trace:\n"); do { diff --git
a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts index 0f9a4f0a5571..a5be0ee048ec 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts @@ -124,7 +124,7 @@ }; }; - iio-hwmon-battery { + iio-hwmon { compatible = "iio-hwmon"; io-channels = <&adc1 7>; }; diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts index 456ca2830a31..c3b0cd61ac85 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts @@ -244,7 +244,7 @@ }; }; - iio-hwmon-battery { + iio-hwmon { compatible = "iio-hwmon"; io-channels = <&adc1 7>; }; diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts index e1b5d44308fe..7162e65b8115 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts @@ -220,7 +220,7 @@ }; }; - iio-hwmon-battery { + iio-hwmon { compatible = "iio-hwmon"; io-channels = <&adc1 7>; }; diff --git a/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi b/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi index 021d9fc1b492..27a1a8952665 100644 --- a/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi @@ -10,7 +10,7 @@ / { thermal-zones { cpu_thermal: cpu-thermal { - thermal-sensors = <&tmu 0>; + thermal-sensors = <&tmu>; polling-delay-passive = <0>; polling-delay = <0>; trips { diff --git a/arch/arm/boot/dts/exynos4210.dtsi b/arch/arm/boot/dts/exynos4210.dtsi index 5a1ec714c612..0e27c3375e2e 100644 --- a/arch/arm/boot/dts/exynos4210.dtsi +++ b/arch/arm/boot/dts/exynos4210.dtsi @@ -393,7 +393,6 @@ &cpu_thermal { polling-delay-passive = <0>; polling-delay = <0>; - thermal-sensors = <&tmu 0>; }; &gic { diff --git a/arch/arm/boot/dts/exynos5250.dtsi b/arch/arm/boot/dts/exynos5250.dtsi index a5db4ac213d5..60a623e3a200 100644 --- a/arch/arm/boot/dts/exynos5250.dtsi +++ b/arch/arm/boot/dts/exynos5250.dtsi @@ -1107,7 +1107,7 @@ &cpu_thermal { polling-delay-passive = <0>; polling-delay = <0>; - thermal-sensors = <&tmu 0>; + thermal-sensors = <&tmu>; cooling-maps { map0 { diff --git a/arch/arm/boot/dts/exynos5410-odroidxu.dts b/arch/arm/boot/dts/exynos5410-odroidxu.dts index 232561620da2..6ddd1dd2fb0b 100644 --- a/arch/arm/boot/dts/exynos5410-odroidxu.dts +++ b/arch/arm/boot/dts/exynos5410-odroidxu.dts @@ -120,7 +120,6 @@ }; &cpu0_thermal { - thermal-sensors = <&tmu_cpu0 0>; polling-delay-passive = <0>; polling-delay = <0>; diff --git a/arch/arm/boot/dts/exynos5422-odroidhc1.dts b/arch/arm/boot/dts/exynos5422-odroidhc1.dts index 3de7019572a2..5e4280393706 100644 --- a/arch/arm/boot/dts/exynos5422-odroidhc1.dts +++ b/arch/arm/boot/dts/exynos5422-odroidhc1.dts @@ -31,7 +31,7 @@ thermal-zones { cpu0_thermal: cpu0-thermal { - thermal-sensors = <&tmu_cpu0 0>; + thermal-sensors = <&tmu_cpu0>; trips { cpu0_alert0: cpu-alert-0 { temperature = <70000>; /* millicelsius */ @@ -86,7 +86,7 @@ }; }; cpu1_thermal: cpu1-thermal { - thermal-sensors = <&tmu_cpu1 0>; + thermal-sensors = <&tmu_cpu1>; trips { cpu1_alert0: cpu-alert-0 { temperature = <70000>; @@ -130,7 +130,7 @@ }; }; cpu2_thermal: cpu2-thermal { - thermal-sensors = <&tmu_cpu2 0>; + thermal-sensors = <&tmu_cpu2>; trips { cpu2_alert0: cpu-alert-0 { temperature = <70000>; @@ -174,7 +174,7 @@ }; }; cpu3_thermal: cpu3-thermal { - thermal-sensors = <&tmu_cpu3 0>; + thermal-sensors = <&tmu_cpu3>; trips { cpu3_alert0: cpu-alert-0 { temperature = <70000>; @@ -218,7 
+218,7 @@ }; }; gpu_thermal: gpu-thermal { - thermal-sensors = <&tmu_gpu 0>; + thermal-sensors = <&tmu_gpu>; trips { gpu_alert0: gpu-alert-0 { temperature = <70000>; diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi index a6961ff24030..e6e7e2ff2a26 100644 --- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi +++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi @@ -50,7 +50,7 @@ thermal-zones { cpu0_thermal: cpu0-thermal { - thermal-sensors = <&tmu_cpu0 0>; + thermal-sensors = <&tmu_cpu0>; polling-delay-passive = <250>; polling-delay = <0>; trips { @@ -139,7 +139,7 @@ }; }; cpu1_thermal: cpu1-thermal { - thermal-sensors = <&tmu_cpu1 0>; + thermal-sensors = <&tmu_cpu1>; polling-delay-passive = <250>; polling-delay = <0>; trips { @@ -212,7 +212,7 @@ }; }; cpu2_thermal: cpu2-thermal { - thermal-sensors = <&tmu_cpu2 0>; + thermal-sensors = <&tmu_cpu2>; polling-delay-passive = <250>; polling-delay = <0>; trips { @@ -285,7 +285,7 @@ }; }; cpu3_thermal: cpu3-thermal { - thermal-sensors = <&tmu_cpu3 0>; + thermal-sensors = <&tmu_cpu3>; polling-delay-passive = <250>; polling-delay = <0>; trips { @@ -358,7 +358,7 @@ }; }; gpu_thermal: gpu-thermal { - thermal-sensors = <&tmu_gpu 0>; + thermal-sensors = <&tmu_gpu>; polling-delay-passive = <250>; polling-delay = <0>; trips { diff --git a/arch/arm/boot/dts/spear320-hmi.dts b/arch/arm/boot/dts/spear320-hmi.dts index 34503ac9c51c..721e5ee7b680 100644 --- a/arch/arm/boot/dts/spear320-hmi.dts +++ b/arch/arm/boot/dts/spear320-hmi.dts @@ -241,7 +241,7 @@ irq-trigger = <0x1>; stmpegpio: stmpe-gpio { - compatible = "stmpe,gpio"; + compatible = "st,stmpe-gpio"; reg = <0>; gpio-controller; #gpio-cells = <2>; diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c index 5d2f386a46d8..eca2fe0f4314 100644 --- a/arch/arm/mach-qcom/platsmp.c +++ b/arch/arm/mach-qcom/platsmp.c @@ -14,7 +14,7 @@ #include <linux/of_address.h> #include <linux/smp.h> #include <linux/io.h> -#include <linux/qcom_scm.h> +#include <linux/firmware/qcom/qcom_scm.h> #include <asm/smp_plat.h> diff --git a/arch/arm/mach-s3c/Makefile b/arch/arm/mach-s3c/Makefile index 988c49672715..713827bef831 100644 --- a/arch/arm/mach-s3c/Makefile +++ b/arch/arm/mach-s3c/Makefile @@ -2,7 +2,7 @@ # # Copyright 2009 Simtec Electronics -include $(src)/Makefile.s3c64xx +include $(srctree)/$(src)/Makefile.s3c64xx # Objects we always build independent of SoC choice diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index c135f6e37a00..8bc01071474a 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -984,7 +984,8 @@ __iommu_create_mapping(struct device *dev, struct page **pages, size_t size, len = (j - i) << PAGE_SHIFT; ret = iommu_map(mapping->domain, iova, phys, len, - __dma_info_to_prot(DMA_BIDIRECTIONAL, attrs)); + __dma_info_to_prot(DMA_BIDIRECTIONAL, attrs), + GFP_KERNEL); if (ret < 0) goto fail; iova += len; @@ -1207,7 +1208,8 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg, prot = __dma_info_to_prot(dir, attrs); - ret = iommu_map(mapping->domain, iova, phys, len, prot); + ret = iommu_map(mapping->domain, iova, phys, len, prot, + GFP_KERNEL); if (ret < 0) goto fail; count += len >> PAGE_SHIFT; @@ -1379,7 +1381,8 @@ static dma_addr_t arm_iommu_map_page(struct device *dev, struct page *page, prot = __dma_info_to_prot(dir, attrs); - ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len, prot); + ret = iommu_map(mapping->domain, dma_addr, 
page_to_phys(page), len, + prot, GFP_KERNEL); if (ret < 0) goto fail; @@ -1443,7 +1446,7 @@ static dma_addr_t arm_iommu_map_resource(struct device *dev, prot = __dma_info_to_prot(dir, attrs) | IOMMU_MMIO; - ret = iommu_map(mapping->domain, dma_addr, addr, len, prot); + ret = iommu_map(mapping->domain, dma_addr, addr, len, prot, GFP_KERNEL); if (ret < 0) goto fail; diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi index f69fa68a1222..2f0e460acccd 100644 --- a/arch/arm64/boot/dts/qcom/sm8250.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi @@ -2293,7 +2293,7 @@ pinctrl-names = "default"; pinctrl-0 = <&rx_swr_active>; compatible = "qcom,sm8250-lpass-rx-macro"; - reg = <0 0x3200000 0 0x1000>; + reg = <0 0x03200000 0 0x1000>; status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2310,7 +2310,7 @@ }; swr1: soundwire-controller@3210000 { - reg = <0 0x3210000 0 0x2000>; + reg = <0 0x03210000 0 0x2000>; compatible = "qcom,soundwire-v1.5.1"; status = "disabled"; interrupts = <GIC_SPI 298 IRQ_TYPE_LEVEL_HIGH>; @@ -2339,7 +2339,7 @@ pinctrl-names = "default"; pinctrl-0 = <&tx_swr_active>; compatible = "qcom,sm8250-lpass-tx-macro"; - reg = <0 0x3220000 0 0x1000>; + reg = <0 0x03220000 0 0x1000>; status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2357,7 +2357,7 @@ /* tx macro */ swr2: soundwire-controller@3230000 { - reg = <0 0x3230000 0 0x2000>; + reg = <0 0x03230000 0 0x2000>; compatible = "qcom,soundwire-v1.5.1"; interrupts-extended = <&intc GIC_SPI 297 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "core"; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 7e0487bbdaa0..7790ee42c68a 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -115,6 +115,7 @@ CONFIG_KVM=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_IOSCHED_BFQ=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y @@ -219,7 +220,7 @@ CONFIG_PCI_HOST_THUNDER_ECAM=y CONFIG_PCIE_ROCKCHIP_HOST=m CONFIG_PCIE_MEDIATEK_GEN3=m CONFIG_PCIE_BRCMSTB=m -CONFIG_PCI_IMX6=y +CONFIG_PCI_IMX6_HOST=y CONFIG_PCI_LAYERSCAPE=y CONFIG_PCI_HISI=y CONFIG_PCIE_QCOM=y diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index c0b178d1bb4f..a51e6e8f3171 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -16,6 +16,15 @@ #define CLIDR_LOC(clidr) (((clidr) >> CLIDR_LOC_SHIFT) & 0x7) #define CLIDR_LOUIS(clidr) (((clidr) >> CLIDR_LOUIS_SHIFT) & 0x7) +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +/* Ttypen, bits [2(n - 1) + 34 : 2(n - 1) + 33], for n = 1 to 7 */ +#define CLIDR_TTYPE_SHIFT(level) (2 * ((level) - 1) + CLIDR_EL1_Ttypen_SHIFT) + /* * Memory returned by kmalloc() may be used for DMA, so we must make * sure that all such allocations are cache aligned. 
Otherwise, diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 2cdd010f9524..037724b19c5c 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -196,4 +196,103 @@ __init_el2_nvhe_prepare_eret .endm +#ifndef __KVM_NVHE_HYPERVISOR__ +// This will clobber tmp1 and tmp2, and expect tmp1 to contain +// the id register value as read from the HW +.macro __check_override idreg, fld, width, pass, fail, tmp1, tmp2 + ubfx \tmp1, \tmp1, #\fld, #\width + cbz \tmp1, \fail + + adr_l \tmp1, \idreg\()_override + ldr \tmp2, [\tmp1, FTR_OVR_VAL_OFFSET] + ldr \tmp1, [\tmp1, FTR_OVR_MASK_OFFSET] + ubfx \tmp2, \tmp2, #\fld, #\width + ubfx \tmp1, \tmp1, #\fld, #\width + cmp \tmp1, xzr + and \tmp2, \tmp2, \tmp1 + csinv \tmp2, \tmp2, xzr, ne + cbnz \tmp2, \pass + b \fail +.endm + +// This will clobber tmp1 and tmp2 +.macro check_override idreg, fld, pass, fail, tmp1, tmp2 + mrs \tmp1, \idreg\()_el1 + __check_override \idreg \fld 4 \pass \fail \tmp1 \tmp2 +.endm +#else +// This will clobber tmp +.macro __check_override idreg, fld, width, pass, fail, tmp, ignore + ldr_l \tmp, \idreg\()_el1_sys_val + ubfx \tmp, \tmp, #\fld, #\width + cbnz \tmp, \pass + b \fail +.endm + +.macro check_override idreg, fld, pass, fail, tmp, ignore + __check_override \idreg \fld 4 \pass \fail \tmp \ignore +.endm +#endif + +.macro finalise_el2_state + check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 + +.Linit_sve_\@: /* SVE register access */ + mrs x0, cptr_el2 // Disable SVE traps + bic x0, x0, #CPTR_EL2_TZ + msr cptr_el2, x0 + isb + mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector + msr_s SYS_ZCR_EL2, x1 // length for EL1. + +.Lskip_sve_\@: + check_override id_aa64pfr1, ID_AA64PFR1_EL1_SME_SHIFT, .Linit_sme_\@, .Lskip_sme_\@, x1, x2 + +.Linit_sme_\@: /* SME register access and priority mapping */ + mrs x0, cptr_el2 // Disable SME traps + bic x0, x0, #CPTR_EL2_TSM + msr cptr_el2, x0 + isb + + mrs x1, sctlr_el2 + orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps + msr sctlr_el2, x1 + isb + + mov x0, #0 // SMCR controls + + // Full FP in SM? + mrs_s x1, SYS_ID_AA64SMFR0_EL1 + __check_override id_aa64smfr0, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, .Linit_sme_fa64_\@, .Lskip_sme_fa64_\@, x1, x2 + +.Linit_sme_fa64_\@: + orr x0, x0, SMCR_ELx_FA64_MASK +.Lskip_sme_fa64_\@: + + // ZT0 available? + mrs_s x1, SYS_ID_AA64SMFR0_EL1 + __check_override id_aa64smfr0, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, .Linit_sme_zt0_\@, .Lskip_sme_zt0_\@, x1, x2 +.Linit_sme_zt0_\@: + orr x0, x0, SMCR_ELx_EZT0_MASK +.Lskip_sme_zt0_\@: + + orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector + msr_s SYS_SMCR_EL2, x0 // length for EL1. + + mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? + ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 + cbz x1, .Lskip_sme_\@ + + msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal + + mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? 
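
The __check_override logic earlier in this hunk is easier to see in C: a feature is used only if the hardware ID field is nonzero and, when an override mask covers the field, the overridden value is nonzero too. A sketch under those assumptions (all names illustrative, not kernel API):

	#include <stdbool.h>
	#include <stdint.h>

	static uint64_t field(uint64_t reg, unsigned int shift, unsigned int width)
	{
		return (reg >> shift) & ((1ULL << width) - 1);	/* like ubfx */
	}

	static bool feature_passes(uint64_t idreg, uint64_t ovr_val,
				   uint64_t ovr_mask, unsigned int shift,
				   unsigned int width)
	{
		if (!field(idreg, shift, width))
			return false;	/* hardware doesn't implement it */
		if (!field(ovr_mask, shift, width))
			return true;	/* no override: trust the ID register */
		/* override present: the masked override value decides */
		return (field(ovr_val, shift, width) &
			field(ovr_mask, shift, width)) != 0;
	}
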
+ ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 + cbz x1, .Lskip_sme_\@ + + mrs_s x1, SYS_HCRX_EL2 + orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping + msr_s SYS_HCRX_EL2, x1 +.Lskip_sme_\@: +.endm + #endif /* __ARM_KVM_INIT_H__ */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index c9f15b9e3c71..8487aec9b658 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -272,6 +272,10 @@ (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ ESR_ELx_SYS64_ISS_OP2_SHIFT)) +/* ISS field definitions for ERET/ERETAA/ERETAB trapping */ +#define ESR_ELx_ERET_ISS_ERET 0x2 +#define ESR_ELx_ERET_ISS_ERETA 0x1 + /* * ISS field definitions for floating-point exception traps * (FP_EXC_32/FP_EXC_64). diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 26b0c97df986..baef29fcbeee 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -81,11 +81,12 @@ * SWIO: Turn set/way invalidates into set/way clean+invalidate * PTW: Take a stage2 fault if a stage1 walk steps in device memory * TID3: Trap EL1 reads of group 3 ID registers + * TID2: Trap CTR_EL0, CCSIDR2_EL1, CLIDR_EL1, and CSSELR_EL1 */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ - HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 ) + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) @@ -344,10 +345,26 @@ ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ - ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) + ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET) -#define CPACR_EL1_TTA (1 << 28) #define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\ CPACR_EL1_ZEN_EL1EN) +#define kvm_mode_names \ + { PSR_MODE_EL0t, "EL0t" }, \ + { PSR_MODE_EL1t, "EL1t" }, \ + { PSR_MODE_EL1h, "EL1h" }, \ + { PSR_MODE_EL2t, "EL2t" }, \ + { PSR_MODE_EL2h, "EL2h" }, \ + { PSR_MODE_EL3t, "EL3t" }, \ + { PSR_MODE_EL3h, "EL3h" }, \ + { PSR_AA32_MODE_USR, "32-bit USR" }, \ + { PSR_AA32_MODE_FIQ, "32-bit FIQ" }, \ + { PSR_AA32_MODE_IRQ, "32-bit IRQ" }, \ + { PSR_AA32_MODE_SVC, "32-bit SVC" }, \ + { PSR_AA32_MODE_ABT, "32-bit ABT" }, \ + { PSR_AA32_MODE_HYP, "32-bit HYP" }, \ + { PSR_AA32_MODE_UND, "32-bit UND" }, \ + { PSR_AA32_MODE_SYS, "32-bit SYS" } + #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 193583df2d9c..b31b32ecbe2d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -33,6 +33,12 @@ enum exception_type { except_type_serror = 0x180, }; +#define kvm_exception_type_names \ + { except_type_sync, "SYNC" }, \ + { except_type_irq, "IRQ" }, \ + { except_type_fiq, "FIQ" }, \ + { except_type_serror, "SERROR" } + bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); @@ -44,6 +50,10 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu); void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); +void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); +int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); +int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); + #if defined(__KVM_VHE_HYPERVISOR__) 
|| defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { @@ -88,10 +98,6 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; - if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || - vcpu_el1_is_32bit(vcpu)) - vcpu->arch.hcr_el2 |= HCR_TID2; - if (kvm_has_mte(vcpu->kvm)) vcpu->arch.hcr_el2 |= HCR_ATA; } @@ -183,6 +189,62 @@ static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, vcpu_gp_regs(vcpu)->regs[reg_num] = val; } +static inline bool vcpu_is_el2_ctxt(const struct kvm_cpu_context *ctxt) +{ + switch (ctxt->regs.pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) { + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + return true; + default: + return false; + } +} + +static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) +{ + return vcpu_is_el2_ctxt(&vcpu->arch.ctxt); +} + +static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt) +{ + return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H; +} + +static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) +{ + return __vcpu_el2_e2h_is_set(&vcpu->arch.ctxt); +} + +static inline bool __vcpu_el2_tge_is_set(const struct kvm_cpu_context *ctxt) +{ + return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_TGE; +} + +static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) +{ + return __vcpu_el2_tge_is_set(&vcpu->arch.ctxt); +} + +static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) +{ + /* + * We are in a hypervisor context if the vcpu mode is EL2 or + * E2H and TGE bits are set. The latter means we are in the user space + * of the VHE kernel. ARMv8.1 ARM describes this as 'InHost' + * + * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the + * rest of the KVM code, and will result in a misbehaving guest. + */ + return vcpu_is_el2_ctxt(ctxt) || + (__vcpu_el2_e2h_is_set(ctxt) && __vcpu_el2_tge_is_set(ctxt)) || + __vcpu_el2_tge_is_set(ctxt); +} + +static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) +{ + return __is_hyp_ctxt(&vcpu->arch.ctxt); +} + /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..a1892a8f6032 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -60,14 +60,19 @@ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, + KVM_MODE_NV, KVM_MODE_NONE, }; +#ifdef CONFIG_KVM enum kvm_mode kvm_get_mode(void); +#else +static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; +#endif DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -extern unsigned int kvm_sve_max_vl; -int kvm_arm_init_sve(void); +extern unsigned int __ro_after_init kvm_sve_max_vl; +int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -252,6 +257,7 @@ struct kvm_vcpu_fault_info { enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ + CLIDR_EL1, /* Cache Level ID Register */ CSSELR_EL1, /* Cache Size Selection Register */ SCTLR_EL1, /* System Control Register */ ACTLR_EL1, /* Auxiliary Control Register */ @@ -320,12 +326,43 @@ enum vcpu_sysreg { TFSR_EL1, /* Tag Fault Status Register (EL1) */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ - /* 32bit specific registers. 
Keep them at the end of the range */ + /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ FPEXC32_EL2, /* Floating-Point Exception Control Register */ DBGVCR32_EL2, /* Debug Vector Catch Register */ + /* EL2 registers */ + VPIDR_EL2, /* Virtualization Processor ID Register */ + VMPIDR_EL2, /* Virtualization Multiprocessor ID Register */ + SCTLR_EL2, /* System Control Register (EL2) */ + ACTLR_EL2, /* Auxiliary Control Register (EL2) */ + HCR_EL2, /* Hypervisor Configuration Register */ + MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ + CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ + HSTR_EL2, /* Hypervisor System Trap Register */ + HACR_EL2, /* Hypervisor Auxiliary Control Register */ + TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ + TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ + TCR_EL2, /* Translation Control Register (EL2) */ + VTTBR_EL2, /* Virtualization Translation Table Base Register */ + VTCR_EL2, /* Virtualization Translation Control Register */ + SPSR_EL2, /* EL2 saved program status register */ + ELR_EL2, /* EL2 exception link register */ + AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ + AFSR1_EL2, /* Auxiliary Fault Status Register 1 (EL2) */ + ESR_EL2, /* Exception Syndrome Register (EL2) */ + FAR_EL2, /* Fault Address Register (EL2) */ + HPFAR_EL2, /* Hypervisor IPA Fault Address Register */ + MAIR_EL2, /* Memory Attribute Indirection Register (EL2) */ + AMAIR_EL2, /* Auxiliary Memory Attribute Indirection Register (EL2) */ + VBAR_EL2, /* Vector Base Address Register (EL2) */ + RVBAR_EL2, /* Reset Vector Base Address Register */ + CONTEXTIDR_EL2, /* Context ID Register (EL2) */ + TPIDR_EL2, /* EL2 Software Thread ID Register */ + CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ + SP_EL2, /* EL2 Stack Pointer */ + NR_SYS_REGS /* Nothing after this line! 
*/ }; @@ -501,6 +538,9 @@ struct kvm_vcpu_arch { u64 last_steal; gpa_t base; } steal; + + /* Per-vcpu CCSIDR override or NULL */ + u32 *ccsidr; }; /* @@ -598,7 +638,7 @@ struct kvm_vcpu_arch { #define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) #define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) #define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) -/* For AArch64 with NV (one day): */ +/* For AArch64 with NV: */ #define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) #define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) #define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) @@ -609,6 +649,8 @@ struct kvm_vcpu_arch { #define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) /* Save TRBE context if active */ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) +/* vcpu running in HYP context */ +#define VCPU_HYP_CONTEXT __vcpu_single_flag(iflags, BIT(7)) /* SVE enabled for host EL0 */ #define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) @@ -705,7 +747,6 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) return false; switch (reg) { - case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break; case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; @@ -750,7 +791,6 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) return false; switch (reg) { - case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break; case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; @@ -877,7 +917,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); -int kvm_sys_reg_table_init(void); +int __init kvm_sys_reg_table_init(void); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -908,20 +948,20 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -extern unsigned int kvm_arm_vmid_bits; -int kvm_arm_vmid_alloc_init(void); -void kvm_arm_vmid_alloc_free(void); +extern unsigned int __ro_after_init kvm_arm_vmid_bits; +int __init kvm_arm_vmid_alloc_init(void); +void __init kvm_arm_vmid_alloc_free(void); void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) { - vcpu_arch->steal.base = GPA_INVALID; + vcpu_arch->steal.base = INVALID_GPA; } static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch) { - return (vcpu_arch->steal.base != GPA_INVALID); + return (vcpu_arch->steal.base != INVALID_GPA); } void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); @@ -943,7 +983,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void) void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} @@ -994,7 +1033,7 @@ static inline void kvm_clr_pmu_events(u32 clr) {} void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu); -int kvm_set_ipa_limit(void); +int __init kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h 
b/arch/arm64/include/asm/kvm_hyp.h index 6797eafe7890..bdd9cf546d95 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -122,6 +122,7 @@ extern u64 kvm_nvhe_sym(id_aa64isar2_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e4a7e6369499..083cc47dca08 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -115,6 +115,7 @@ alternative_cb_end #include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_host.h> void kvm_update_va_mask(struct alt_instr *alt, @@ -163,7 +164,7 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **haddr); int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); -void free_hyp_pgds(void); +void __init free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); @@ -175,7 +176,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -int kvm_mmu_init(u32 *hyp_va_bits); +int __init kvm_mmu_init(u32 *hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) @@ -192,7 +193,15 @@ struct kvm; static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { - return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; + u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C; + int reg; + + if (vcpu_is_el2(vcpu)) + reg = SCTLR_EL2; + else + reg = SCTLR_EL1; + + return (vcpu_read_sys_reg(vcpu, reg) & cache_bits) == cache_bits; } static inline void __clean_dcache_guest_page(void *va, size_t size) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h new file mode 100644 index 000000000000..8fb67f032fd1 --- /dev/null +++ b/arch/arm64/include/asm/kvm_nested.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_KVM_NESTED_H +#define __ARM64_KVM_NESTED_H + +#include <linux/kvm_host.h> + +static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) +{ + return (!__is_defined(__KVM_NVHE_HYPERVISOR__) && + cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && + test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)); +} + +struct sys_reg_params; +struct sys_reg_desc; + +void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p, + const struct sys_reg_desc *r); + +#endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 63f81b27a4e3..4cd6762bda80 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -71,6 +71,11 @@ static inline kvm_pte_t kvm_phys_to_pte(u64 pa) return pte; } +static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) +{ + return __phys_to_pfn(kvm_pte_to_phys(pte)); +} + static inline u64 kvm_granule_shift(u32 level) { /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ @@ -188,12 +193,15 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * children. * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared * with other software walkers. 
+ * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was + * invoked from a fault handler. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), KVM_PGTABLE_WALK_TABLE_POST = BIT(2), KVM_PGTABLE_WALK_SHARED = BIT(3), + KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), }; struct kvm_pgtable_visit_ctx { diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 043ecc3405e7..9e3ecba3c4e6 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -325,7 +325,6 @@ #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) -#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) #define SYS_RNDR_EL0 sys_reg(3, 3, 2, 4, 0) @@ -411,23 +410,51 @@ #define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7) +#define SYS_VPIDR_EL2 sys_reg(3, 4, 0, 0, 0) +#define SYS_VMPIDR_EL2 sys_reg(3, 4, 0, 0, 5) + #define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0) +#define SYS_ACTLR_EL2 sys_reg(3, 4, 1, 0, 1) +#define SYS_HCR_EL2 sys_reg(3, 4, 1, 1, 0) +#define SYS_MDCR_EL2 sys_reg(3, 4, 1, 1, 1) +#define SYS_CPTR_EL2 sys_reg(3, 4, 1, 1, 2) +#define SYS_HSTR_EL2 sys_reg(3, 4, 1, 1, 3) #define SYS_HFGRTR_EL2 sys_reg(3, 4, 1, 1, 4) #define SYS_HFGWTR_EL2 sys_reg(3, 4, 1, 1, 5) #define SYS_HFGITR_EL2 sys_reg(3, 4, 1, 1, 6) +#define SYS_HACR_EL2 sys_reg(3, 4, 1, 1, 7) + +#define SYS_TTBR0_EL2 sys_reg(3, 4, 2, 0, 0) +#define SYS_TTBR1_EL2 sys_reg(3, 4, 2, 0, 1) +#define SYS_TCR_EL2 sys_reg(3, 4, 2, 0, 2) +#define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) +#define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) + #define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) #define SYS_HDFGRTR_EL2 sys_reg(3, 4, 3, 1, 4) #define SYS_HDFGWTR_EL2 sys_reg(3, 4, 3, 1, 5) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) +#define SYS_SP_EL1 sys_reg(3, 4, 4, 1, 0) #define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1) +#define SYS_AFSR0_EL2 sys_reg(3, 4, 5, 1, 0) +#define SYS_AFSR1_EL2 sys_reg(3, 4, 5, 1, 1) #define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0) #define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) #define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0) -#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) +#define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0) +#define SYS_HPFAR_EL2 sys_reg(3, 4, 6, 0, 4) + +#define SYS_MAIR_EL2 sys_reg(3, 4, 10, 2, 0) +#define SYS_AMAIR_EL2 sys_reg(3, 4, 10, 3, 0) + +#define SYS_VBAR_EL2 sys_reg(3, 4, 12, 0, 0) +#define SYS_RVBAR_EL2 sys_reg(3, 4, 12, 0, 1) +#define SYS_RMR_EL2 sys_reg(3, 4, 12, 0, 2) +#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) #define SYS_ICH_AP0R0_EL2 __SYS__AP0Rx_EL2(0) #define SYS_ICH_AP0R1_EL2 __SYS__AP0Rx_EL2(1) @@ -469,6 +496,12 @@ #define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6) #define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7) +#define SYS_CONTEXTIDR_EL2 sys_reg(3, 4, 13, 0, 1) +#define SYS_TPIDR_EL2 sys_reg(3, 4, 13, 0, 2) + +#define SYS_CNTVOFF_EL2 sys_reg(3, 4, 14, 0, 3) +#define SYS_CNTHCTL_EL2 sys_reg(3, 4, 14, 1, 0) + /* VHE encodings for architectural EL0/1 system registers */ #define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) @@ -491,6 +524,8 @@ #define SYS_CNTV_CTL_EL02 sys_reg(3, 5, 14, 3, 1) #define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2) +#define SYS_SP_EL2 sys_reg(3, 6, 4, 1, 0) + /* Common SCTLR_ELx flags. 
*/ #define SCTLR_ELx_ENTP2 (BIT(60)) #define SCTLR_ELx_DSSBS (BIT(44)) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index a7a857f1784d..f8129c624b07 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -109,6 +109,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_SVE 4 /* enable SVE for this CPU */ #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ +#define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ struct kvm_vcpu_init { __u32 target; diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 3ba70985e3a2..c307f69e9b55 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -11,11 +11,6 @@ #include <linux/of.h> #define MAX_CACHE_LEVEL 7 /* Max 7 level supported */ -/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ -#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) -#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) -#define CLIDR_CTYPE(clidr, level) \ - (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) int cache_line_size(void) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 45a42cf2191c..87687e99fee3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1967,6 +1967,20 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); } +static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, + int scope) +{ + if (kvm_get_mode() != KVM_MODE_NV) + return false; + + if (!has_cpuid_feature(cap, scope)) { + pr_warn("unavailable: %s\n", cap->desc); + return false; + } + + return true; +} + #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { @@ -2263,6 +2277,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_copy_el2regs, }, { + .desc = "Nested Virtualization Support", + .capability = ARM64_HAS_NESTED_VIRT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_nested_virt_support, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_EL1_NV_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64MMFR2_EL1_NV_IMP, + }, + { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_32bit_el0, diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 111ff33d93ee..9439240c3fcf 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -16,30 +16,6 @@ #include <asm/ptrace.h> #include <asm/virt.h> -// Warning, hardcoded register allocation -// This will clobber x1 and x2, and expect x1 to contain -// the id register value as read from the HW -.macro __check_override idreg, fld, width, pass, fail - ubfx x1, x1, #\fld, #\width - cbz x1, \fail - - adr_l x1, \idreg\()_override - ldr x2, [x1, FTR_OVR_VAL_OFFSET] - ldr x1, [x1, FTR_OVR_MASK_OFFSET] - ubfx x2, x2, #\fld, #\width - ubfx x1, x1, #\fld, #\width - cmp x1, xzr - and x2, x2, x1 - csinv x2, x2, xzr, ne - cbnz x2, \pass - b \fail -.endm - -.macro check_override idreg, fld, pass, fail - mrs x1, \idreg\()_el1 - __check_override \idreg \fld 4 \pass \fail -.endm - .text .pushsection .hyp.text, "ax" @@ -98,65 +74,7 @@ SYM_CODE_START_LOCAL(elx_sync) SYM_CODE_END(elx_sync) SYM_CODE_START_LOCAL(__finalise_el2) - check_override 
id_aa64pfr0 ID_AA64PFR0_EL1_SVE_SHIFT .Linit_sve .Lskip_sve - -.Linit_sve: /* SVE register access */ - mrs x0, cptr_el2 // Disable SVE traps - bic x0, x0, #CPTR_EL2_TZ - msr cptr_el2, x0 - isb - mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector - msr_s SYS_ZCR_EL2, x1 // length for EL1. - -.Lskip_sve: - check_override id_aa64pfr1 ID_AA64PFR1_EL1_SME_SHIFT .Linit_sme .Lskip_sme - -.Linit_sme: /* SME register access and priority mapping */ - mrs x0, cptr_el2 // Disable SME traps - bic x0, x0, #CPTR_EL2_TSM - msr cptr_el2, x0 - isb - - mrs x1, sctlr_el2 - orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps - msr sctlr_el2, x1 - isb - - mov x0, #0 // SMCR controls - - // Full FP in SM? - mrs_s x1, SYS_ID_AA64SMFR0_EL1 - __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_FA64_SHIFT 1 .Linit_sme_fa64 .Lskip_sme_fa64 - -.Linit_sme_fa64: - orr x0, x0, SMCR_ELx_FA64_MASK -.Lskip_sme_fa64: - - // ZT0 available? - mrs_s x1, SYS_ID_AA64SMFR0_EL1 - __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_SMEver_SHIFT 4 .Linit_sme_zt0 .Lskip_sme_zt0 -.Linit_sme_zt0: - orr x0, x0, SMCR_ELx_EZT0_MASK -.Lskip_sme_zt0: - - orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector - msr_s SYS_SMCR_EL2, x0 // length for EL1. - - mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported? - ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1 - cbz x1, .Lskip_sme - - msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal - - mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present? - ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 - cbz x1, .Lskip_sme - - mrs_s x1, SYS_HCRX_EL2 - orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping - msr_s SYS_HCRX_EL2, x1 - -.Lskip_sme: + finalise_el2_state // nVHE? No way! Give me the real thing! // Sanity check: MMU *must* be off @@ -164,7 +82,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) tbnz x1, #0, 1f // Needs to be VHE capable, obviously - check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2 1: mov_q x0, HVC_STUB_ERR eret diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 05da3c8f7e88..ca6eadeb7d1a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5e33c2d4645a..c0c050e53157 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -14,7 +14,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb24a76b4224..00610477ec7b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -428,14 +428,17 @@ static void timer_emulate(struct arch_timer_context *ctx) * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. 
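The decision above collapses to a two-clause predicate. A minimal standalone sketch, with hypothetical stand-ins (struct timer_ctx and its fields) for the kernel's timer context and kvm_timer_* helpers:

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical, simplified stand-ins for the kernel's timer context. */
	struct timer_ctx {
		bool enabled, masked;
		long long cval, now;
	};

	static bool timer_irq_can_fire(const struct timer_ctx *t)
	{
		return t->enabled && !t->masked;
	}

	static bool timer_should_fire(const struct timer_ctx *t)
	{
		return timer_irq_can_fire(t) && t->now >= t->cval;
	}

	static void emulate(const struct timer_ctx *t)
	{
		/* Already fired, or can never fire: no soft timer needed. */
		if (timer_should_fire(t) || !timer_irq_can_fire(t))
			return;

		printf("arm hrtimer for %lld ticks\n", t->cval - t->now);
	}

	int main(void)
	{
		struct timer_ctx t = { .enabled = true, .cval = 2000, .now = 1500 };

		emulate(&t);	/* prints: arm hrtimer for 500 ticks */
		return 0;
	}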
*/ - if (!kvm_timer_irq_can_fire(ctx)) { - soft_timer_cancel(&ctx->hrtimer); + if (should_fire || !kvm_timer_irq_can_fire(ctx)) return; - } soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } +static void set_cntvoff(u64 cntvoff) +{ + kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); +} + static void timer_save_state(struct arch_timer_context *ctx) { struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); @@ -459,6 +462,22 @@ static void timer_save_state(struct arch_timer_context *ctx) write_sysreg_el0(0, SYS_CNTV_CTL); isb(); + /* + * The kernel may decide to run userspace after + * calling vcpu_put, so we reset cntvoff to 0 to + * ensure a consistent read between user accesses to + * the virtual counter and kernel access to the + * physical counter of non-VHE case. + * + * For VHE, the virtual counter uses a fixed virtual + * offset of zero, so no need to zero CNTVOFF_EL2 + * register, but this is actually useful when switching + * between EL1/vEL2 with NV. + * + * Do it unconditionally, as this is either unavoidable + * or dirt cheap. + */ + set_cntvoff(0); break; case TIMER_PTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); @@ -532,6 +551,7 @@ static void timer_restore_state(struct arch_timer_context *ctx) switch (index) { case TIMER_VTIMER: + set_cntvoff(timer_get_offset(ctx)); write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); @@ -552,11 +572,6 @@ out: local_irq_restore(flags); } -static void set_cntvoff(u64 cntvoff) -{ - kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); -} - static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; @@ -631,8 +646,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) kvm_timer_vcpu_load_nogic(vcpu); } - set_cntvoff(timer_get_offset(map.direct_vtimer)); - kvm_timer_unblocking(vcpu); timer_restore_state(map.direct_vtimer); @@ -688,15 +701,6 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) if (kvm_vcpu_is_blocking(vcpu)) kvm_timer_blocking(vcpu); - - /* - * The kernel may decide to run userspace after calling vcpu_put, so - * we reset cntvoff to 0 to ensure a consistent read between user - * accesses to the virtual counter and kernel access to the physical - * counter of non-VHE case. For VHE, the virtual counter uses a fixed - * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. 
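The consistency argument rests on a single architectural identity: CNTVCT_EL0 = CNTPCT_EL0 - CNTVOFF_EL2. A standalone illustration of why zeroing the offset at put time makes the host and guest counter views line up:

	#include <stdint.h>
	#include <stdio.h>

	/* Architectural identity: virtual counter = physical counter - offset. */
	static uint64_t cntvct(uint64_t cntpct, uint64_t cntvoff)
	{
		return cntpct - cntvoff;
	}

	int main(void)
	{
		uint64_t cntpct = 1000000;

		/* Guest view while its CNTVOFF_EL2 is live. */
		printf("guest: %llu\n", (unsigned long long)cntvct(cntpct, 250));

		/* Host/userspace view after vcpu_put zeroes the offset. */
		printf("host:  %llu\n", (unsigned long long)cntvct(cntpct, 0));
		return 0;
	}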
- */ - set_cntvoff(0); } /* @@ -811,10 +815,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->host_timer_irq_flags = host_ptimer_irq_flags; } -static void kvm_timer_init_interrupt(void *info) +void kvm_timer_cpu_up(void) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); + if (host_ptimer_irq) + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); +} + +void kvm_timer_cpu_down(void) +{ + disable_percpu_irq(host_vtimer_irq); + if (host_ptimer_irq) + disable_percpu_irq(host_ptimer_irq); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) @@ -926,14 +938,22 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg) { + struct arch_timer_context *timer; + struct timer_map map; u64 val; + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + + if (timer == map.emul_ptimer) + return kvm_arm_timer_read(vcpu, timer, treg); + preempt_disable(); - kvm_timer_vcpu_put(vcpu); + timer_save_state(timer); - val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + val = kvm_arm_timer_read(vcpu, timer, treg); - kvm_timer_vcpu_load(vcpu); + timer_restore_state(timer); preempt_enable(); return val; @@ -967,25 +987,22 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timer_regs treg, u64 val) { - preempt_disable(); - kvm_timer_vcpu_put(vcpu); - - kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); - - kvm_timer_vcpu_load(vcpu); - preempt_enable(); -} - -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} + struct arch_timer_context *timer; + struct timer_map map; -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; + get_timer_map(vcpu, &map); + timer = vcpu_get_timer(vcpu, tmr); + if (timer == map.emul_ptimer) { + soft_timer_cancel(&timer->hrtimer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_emulate(timer); + } else { + preempt_disable(); + timer_save_state(timer); + kvm_arm_timer_write(vcpu, timer, treg, val); + timer_restore_state(timer); + preempt_enable(); + } } static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) @@ -1117,7 +1134,7 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } -int kvm_timer_hyp_init(bool has_gic) +int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; @@ -1185,9 +1202,6 @@ int kvm_timer_hyp_init(bool has_gic) goto out_free_irq; } - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "kvm/arm/timer:starting", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); return 0; out_free_irq: free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9c5573bc4614..3bd732eaf087 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,16 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -146,7 +136,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto err_unshare_kvm; - if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { ret = 
-ENOMEM; goto err_unshare_kvm; } @@ -1539,7 +1529,7 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) +static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; @@ -1682,7 +1672,15 @@ static void _kvm_arch_hardware_enable(void *discard) int kvm_arch_hardware_enable(void) { + int was_enabled = __this_cpu_read(kvm_arm_hardware_enabled); + _kvm_arch_hardware_enable(NULL); + + if (!was_enabled) { + kvm_vgic_cpu_up(); + kvm_timer_cpu_up(); + } + return 0; } @@ -1696,6 +1694,11 @@ static void _kvm_arch_hardware_disable(void *discard) void kvm_arch_hardware_disable(void) { + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + kvm_timer_cpu_down(); + kvm_vgic_cpu_down(); + } + if (!is_protected_kvm_enabled()) _kvm_arch_hardware_disable(NULL); } @@ -1738,26 +1741,26 @@ static struct notifier_block hyp_init_cpu_pm_nb = { .notifier_call = hyp_init_cpu_pm_notifier, }; -static void hyp_cpu_pm_init(void) +static void __init hyp_cpu_pm_init(void) { if (!is_protected_kvm_enabled()) cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } -static void hyp_cpu_pm_exit(void) +static void __init hyp_cpu_pm_exit(void) { if (!is_protected_kvm_enabled()) cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); } #else -static inline void hyp_cpu_pm_init(void) +static inline void __init hyp_cpu_pm_init(void) { } -static inline void hyp_cpu_pm_exit(void) +static inline void __init hyp_cpu_pm_exit(void) { } #endif -static void init_cpu_logical_map(void) +static void __init init_cpu_logical_map(void) { unsigned int cpu; @@ -1774,7 +1777,7 @@ static void init_cpu_logical_map(void) #define init_psci_0_1_impl_state(config, what) \ config.psci_0_1_ ## what ## _implemented = psci_ops.what -static bool init_psci_relay(void) +static bool __init init_psci_relay(void) { /* * If PSCI has not been initialized, protected KVM cannot install @@ -1797,7 +1800,7 @@ static bool init_psci_relay(void) return true; } -static int init_subsystems(void) +static int __init init_subsystems(void) { int err = 0; @@ -1838,13 +1841,22 @@ static int init_subsystems(void) kvm_register_perf_callbacks(NULL); out: + if (err) + hyp_cpu_pm_exit(); + if (err || !is_protected_kvm_enabled()) on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); return err; } -static void teardown_hyp_mode(void) +static void __init teardown_subsystems(void) +{ + kvm_unregister_perf_callbacks(); + hyp_cpu_pm_exit(); +} + +static void __init teardown_hyp_mode(void) { int cpu; @@ -1855,7 +1867,7 @@ static void teardown_hyp_mode(void) } } -static int do_pkvm_init(u32 hyp_va_bits) +static int __init do_pkvm_init(u32 hyp_va_bits) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; @@ -1887,11 +1899,12 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static int __init kvm_hyp_init_protection(u32 hyp_va_bits) { void *addr = phys_to_virt(hyp_mem_base); int ret; @@ -1909,10 +1922,8 @@ static int 
kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } -/** - * Inits Hyp-mode on all online CPUs - */ -static int init_hyp_mode(void) +/* Inits Hyp-mode on all online CPUs */ +static int __init init_hyp_mode(void) { u32 hyp_va_bits; int cpu; @@ -2094,7 +2105,7 @@ out_err: return err; } -static void _kvm_host_prot_finalize(void *arg) +static void __init _kvm_host_prot_finalize(void *arg) { int *err = arg; @@ -2102,7 +2113,7 @@ static void _kvm_host_prot_finalize(void *arg) WRITE_ONCE(*err, -EINVAL); } -static int pkvm_drop_host_privileges(void) +static int __init pkvm_drop_host_privileges(void) { int ret = 0; @@ -2115,7 +2126,7 @@ static int pkvm_drop_host_privileges(void) return ret; } -static int finalize_hyp_mode(void) +static int __init finalize_hyp_mode(void) { if (!is_protected_kvm_enabled()) return 0; @@ -2187,10 +2198,8 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) kvm_arm_resume_guest(irqfd->kvm); } -/** - * Initialize Hyp-mode and memory mappings on all CPUs. - */ -int kvm_arch_init(void *opaque) +/* Initialize Hyp-mode and memory mappings on all CPUs */ +static __init int kvm_arm_init(void) { int err; bool in_hyp_mode; @@ -2241,7 +2250,7 @@ int kvm_arch_init(void *opaque) err = kvm_init_vector_slots(); if (err) { kvm_err("Cannot initialise vector slots\n"); - goto out_err; + goto out_hyp; } err = init_subsystems(); @@ -2252,7 +2261,7 @@ int kvm_arch_init(void *opaque) err = finalize_hyp_mode(); if (err) { kvm_err("Failed to finalize Hyp protection\n"); - goto out_hyp; + goto out_subs; } } @@ -2264,10 +2273,19 @@ int kvm_arch_init(void *opaque) kvm_info("Hyp mode initialized successfully\n"); } + /* + * FIXME: Do something reasonable if kvm_init() fails after pKVM + * hypervisor protection is finalized. + */ + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (err) + goto out_subs; + return 0; +out_subs: + teardown_subsystems(); out_hyp: - hyp_cpu_pm_exit(); if (!in_hyp_mode) teardown_hyp_mode(); out_err: @@ -2275,12 +2293,6 @@ out_err: return err; } -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - kvm_unregister_perf_callbacks(); -} - static int __init early_kvm_mode_cfg(char *arg) { if (!arg) @@ -2310,6 +2322,11 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } + if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_NV; + return 0; + } + return -EINVAL; } early_param("kvm-arm.mode", early_kvm_mode_cfg); @@ -2319,10 +2336,4 @@ enum kvm_mode kvm_get_mode(void) return kvm_mode; } -static int arm_init(void) -{ - int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - return rc; -} - -module_init(arm_init); +module_init(kvm_arm_init); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c new file mode 100644 index 000000000000..b96662029fb1 --- /dev/null +++ b/arch/arm64/kvm/emulate-nested.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 - Linaro and Columbia University + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> + +#include "hyp/include/hyp/adjust_pc.h" + +#include "trace.h" + +static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) +{ + u64 mode = spsr & PSR_MODE_MASK; + + /* + * Possible causes for an Illegal Exception Return from EL2: + * - trying to return to EL3 + * - trying to return to an illegal M value + * - trying to return to a 32bit EL 
+ * - trying to return to EL1 with HCR_EL2.TGE set + */ + if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h || + mode == 0b00001 || (mode & BIT(1)) || + (spsr & PSR_MODE32_BIT) || + (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t || + mode == PSR_MODE_EL1h))) { + /* + * The guest is playing with our nerves. Preserve EL, SP, + * masks, flags from the existing PSTATE, and set IL. + * The HW will then generate an Illegal State Exception + * immediately after ERET. + */ + spsr = *vcpu_cpsr(vcpu); + + spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | + PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | + PSR_MODE_MASK | PSR_MODE32_BIT); + spsr |= PSR_IL_BIT; + } + + return spsr; +} + +void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) +{ + u64 spsr, elr, mode; + bool direct_eret; + + /* + * Going through the whole put/load motions is a waste of time + * if this is a VHE guest hypervisor returning to its own + * userspace, or the hypervisor performing a local exception + * return. No need to save/restore registers, no need to + * switch S2 MMU. Just do the canonical ERET. + */ + spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); + spsr = kvm_check_illegal_exception_return(vcpu, spsr); + + mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); + + direct_eret = (mode == PSR_MODE_EL0t && + vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)); + direct_eret |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); + + if (direct_eret) { + *vcpu_pc(vcpu) = vcpu_read_sys_reg(vcpu, ELR_EL2); + *vcpu_cpsr(vcpu) = spsr; + trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr); + return; + } + + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + + elr = __vcpu_sys_reg(vcpu, ELR_EL2); + + trace_kvm_nested_eret(vcpu, elr, spsr); + + /* + * Note that the current exception level is always the virtual EL2, + * since we set HCR_EL2.NV bit only when entering the virtual EL2. + */ + *vcpu_pc(vcpu) = elr; + *vcpu_cpsr(vcpu) = spsr; + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} + +static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + trace_kvm_inject_nested_exception(vcpu, esr_el2, type); + + switch (type) { + case except_type_sync: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); + vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2); + break; + case except_type_irq: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ); + break; + default: + WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type); + } +} + +/* + * Emulate taking an exception to EL2. + * See ARM ARM J8.1.2 AArch64.TakeException() + */ +static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, + enum exception_type type) +{ + u64 pstate, mode; + bool direct_inject; + + if (!vcpu_has_nv(vcpu)) { + kvm_err("Unexpected call to %s for the non-nesting configuration\n", + __func__); + return -EINVAL; + } + + /* + * As for ERET, we can avoid doing too much on the injection path by + * checking that we either took the exception from a VHE host + * userspace or from vEL2. In these cases, there is no change in + * translation regime (or anything else), so let's do as little as + * possible. 
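A compilable sketch of that fast-path test (shared by the ERET and injection paths), using the architectural PSTATE.M encodings and plain booleans standing in for the vcpu_el2_e2h_is_set()/vcpu_el2_tge_is_set() helpers:

	#include <stdbool.h>
	#include <stdint.h>

	/* AArch64 PSTATE.M encodings, as in the kernel's ptrace.h. */
	#define PSR_MODE_EL0t	0x0ull
	#define PSR_MODE_EL2t	0x8ull
	#define PSR_MODE_EL2h	0x9ull
	#define PSR_MODE_MASK	0xfull
	#define PSR_MODE32_BIT	0x10ull

	/* True when the target context stays in the current translation
	 * regime (vEL2 itself, or EL0 with E2H+TGE), so no put/load or
	 * S2 MMU switch is needed. */
	static bool stays_in_hyp_regime(uint64_t spsr, bool e2h, bool tge)
	{
		uint64_t mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);

		if (mode == PSR_MODE_EL0t && e2h && tge)
			return true;

		return mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t;
	}

	int main(void)
	{
		/* A vEL2->vEL2 local exception return: take the fast path. */
		return stays_in_hyp_regime(PSR_MODE_EL2h, true, false) ? 0 : 1;
	}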
+ */ + pstate = *vcpu_cpsr(vcpu); + mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + direct_inject = (mode == PSR_MODE_EL0t && + vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)); + direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); + + if (direct_inject) { + kvm_inject_el2_exception(vcpu, esr_el2, type); + return 1; + } + + preempt_disable(); + + /* + * We may have an exception or PC update in the EL0/EL1 context. + * Commit it before entering EL2. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_put(vcpu); + + kvm_inject_el2_exception(vcpu, esr_el2, type); + + /* + * A hard requirement is that a switch between EL1 and EL2 + * contexts has to happen between a put/load, so that we can + * pick the correct timer and interrupt configuration, among + * other things. + * + * Make sure the exception actually took place before we load + * the new context. + */ + __kvm_adjust_pc(vcpu); + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); + + return 1; +} + +int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + return kvm_inject_nested(vcpu, esr_el2, except_type_sync); +} + +int kvm_inject_nested_irq(struct kvm_vcpu *vcpu) +{ + /* + * Do not inject an irq if the: + * - Current exception level is EL2, and + * - virtual HCR_EL2.TGE == 0 + * - virtual HCR_EL2.IMO == 0 + * + * See Table D1-17 "Physical interrupt target and masking when EL3 is + * not implemented and EL2 is implemented" in ARM DDI 0487C.a. + */ + + if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) && + !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO)) + return 1; + + /* esr_el2 value doesn't matter for exits due to irqs. */ + return kvm_inject_nested(vcpu, 0, except_type_irq); +} diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 235775d0c825..1279949599b5 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -184,6 +184,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) sysreg_clear_set(CPACR_EL1, CPACR_EL1_SMEN_EL0EN, CPACR_EL1_SMEN_EL1EN); + isb(); } if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index cf4c495a4321..07444fa22888 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -24,6 +24,7 @@ #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> #include <asm/sigcontext.h> #include "trace.h" @@ -253,6 +254,11 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + if (!vcpu_has_nv(vcpu)) + return -EINVAL; + fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e778eefcf214..a798c0b4d717 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -16,6 +16,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/debug-monitors.h> #include <asm/stacktrace/nvhe.h> #include <asm/traps.h> @@ -41,6 +42,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu) kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; + /* Forward hvc instructions to the virtual EL2 if the guest has EL2. 
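The routing that comment describes is a two-way branch. A small sketch, assuming HCR_EL2.HCD at bit 29 (per the Arm ARM) and a boolean in place of vcpu_has_nv():

	#include <stdbool.h>
	#include <stdint.h>

	#define HCR_HCD (1ull << 29)	/* HVC instruction disable */

	enum hvc_action { HVC_HANDLE_IN_KVM, HVC_UNDEF, HVC_FORWARD_TO_VEL2 };

	/* With nesting, the guest hypervisor owns HVC: either it disabled
	 * the instruction (HCD -> UNDEF at vEL1) or the call is reflected
	 * to vEL2 as a synchronous exception. Without nesting, KVM handles
	 * the hypercall itself. */
	static enum hvc_action route_hvc(bool has_nv, uint64_t vhcr_el2)
	{
		if (!has_nv)
			return HVC_HANDLE_IN_KVM;

		return (vhcr_el2 & HCR_HCD) ? HVC_UNDEF : HVC_FORWARD_TO_VEL2;
	}

	int main(void)
	{
		return route_hvc(true, HCR_HCD) == HVC_UNDEF ? 0 : 1;
	}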
*/ + if (vcpu_has_nv(vcpu)) { + if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD) + kvm_inject_undefined(vcpu); + else + kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + + return 1; + } + ret = kvm_hvc_call_handler(vcpu); if (ret < 0) { vcpu_set_reg(vcpu, 0, ~0UL); @@ -52,6 +63,8 @@ static int handle_hvc(struct kvm_vcpu *vcpu) static int handle_smc(struct kvm_vcpu *vcpu) { + int ret; + /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a @@ -59,10 +72,30 @@ static int handle_smc(struct kvm_vcpu *vcpu) * * We need to advance the PC after the trap, as it would * otherwise return to the same address... + * + * Only handle SMCs from the virtual EL2 with an immediate of zero and + * skip it otherwise. */ - vcpu_set_reg(vcpu, 0, ~0UL); + if (!vcpu_is_el2(vcpu) || kvm_vcpu_hvc_get_imm(vcpu)) { + vcpu_set_reg(vcpu, 0, ~0UL); + kvm_incr_pc(vcpu); + return 1; + } + + /* + * If imm is zero then it is likely an SMCCC call. + * + * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed + * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than + * being treated as UNDEFINED. + */ + ret = kvm_hvc_call_handler(vcpu); + if (ret < 0) + vcpu_set_reg(vcpu, 0, ~0UL); + kvm_incr_pc(vcpu); - return 1; + + return ret; } /* @@ -196,6 +229,15 @@ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) return 1; } +static int kvm_handle_eret(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ERET_ISS_ERET) + return kvm_handle_ptrauth(vcpu); + + kvm_emulate_nested_eret(vcpu); + return 1; +} + static exit_handle_fn arm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, [ESR_ELx_EC_WFx] = kvm_handle_wfx, @@ -211,6 +253,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_SMC64] = handle_smc, [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, [ESR_ELx_EC_SVE] = handle_sve, + [ESR_ELx_EC_ERET] = kvm_handle_eret, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 791d3de76771..424a5107cddb 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -14,6 +14,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__) #error Hypervisor code only! 
@@ -23,7 +24,9 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val; - if (__vcpu_read_sys_reg_from_cpu(reg, &val)) + if (unlikely(vcpu_has_nv(vcpu))) + return vcpu_read_sys_reg(vcpu, reg); + else if (__vcpu_read_sys_reg_from_cpu(reg, &val)) return val; return __vcpu_sys_reg(vcpu, reg); @@ -31,18 +34,25 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (__vcpu_write_sys_reg_to_cpu(val, reg)) - return; - - __vcpu_sys_reg(vcpu, reg) = val; + if (unlikely(vcpu_has_nv(vcpu))) + vcpu_write_sys_reg(vcpu, val, reg); + else if (!__vcpu_write_sys_reg_to_cpu(val, reg)) + __vcpu_sys_reg(vcpu, reg) = val; } -static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val) +static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, + u64 val) { - if (has_vhe()) + if (unlikely(vcpu_has_nv(vcpu))) { + if (target_mode == PSR_MODE_EL1h) + vcpu_write_sys_reg(vcpu, val, SPSR_EL1); + else + vcpu_write_sys_reg(vcpu, val, SPSR_EL2); + } else if (has_vhe()) { write_sysreg_el1(val, SYS_SPSR); - else + } else { __vcpu_sys_reg(vcpu, SPSR_EL1) = val; + } } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) @@ -101,6 +111,11 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); break; + case PSR_MODE_EL2h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2); + break; default: /* Don't do that */ BUG(); @@ -153,7 +168,7 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, new |= target_mode; *vcpu_cpsr(vcpu) = new; - __vcpu_write_spsr(vcpu, old); + __vcpu_write_spsr(vcpu, target_mode, old); } /* @@ -323,11 +338,20 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync); + break; + + case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq); + break; + default: /* - * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} - * will be implemented at some point. Everything - * else gets silently ignored. + * Only EL1_SYNC and EL2_{SYNC,IRQ} makes + * sense so far. Everything else gets silently + * ignored. 
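Entering one of these exceptions ultimately means picking an offset into the target's vector table. A standalone sketch of the AArch64 layout that enter_exception64() relies on; the except_type values match the kernel's enum, the helper name is illustrative:

	#include <stdint.h>

	/* Vector offsets within a group, as in the kernel's except_type enum. */
	enum except_type {
		except_type_sync	= 0,
		except_type_irq		= 0x80,
		except_type_fiq		= 0x100,
		except_type_serror	= 0x180,
	};

	/* Four groups of four entries: current EL with SP_EL0 (0x0),
	 * current EL with SP_ELx (0x200), lower EL AArch64 (0x400),
	 * lower EL AArch32 (0x600). KVM's injected EL1h/EL2h exceptions
	 * use the SP_ELx group. */
	static uint64_t except_vector(uint64_t vbar, int from_lower_el,
				      int lower_is_aarch32, enum except_type type)
	{
		uint64_t group = 0x200;	/* current EL, SP_ELx */

		if (from_lower_el)
			group = lower_is_aarch32 ? 0x600 : 0x400;

		return vbar + group + type;
	}

	int main(void)
	{
		/* IRQ taken at vEL2 with SP_EL2: VBAR_EL2 + 0x280. */
		return except_vector(0x1000, 0, 0, except_type_irq) == 0x1280 ? 0 : 1;
	}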
*/ break; } diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index baa5b9b3dde5..699ea1f8d409 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -39,7 +39,6 @@ static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1); ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR); ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0); @@ -95,7 +94,6 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); - write_sysreg(ctxt_sys_reg(ctxt, CSSELR_EL1), csselr_el1); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -156,9 +154,26 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR); } +/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */ +static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt) +{ + u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL2t: + mode = PSR_MODE_EL1t; + break; + case PSR_MODE_EL2h: + mode = PSR_MODE_EL1h; + break; + } + + return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; +} + static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { - u64 pstate = ctxt->regs.pstate; + u64 pstate = to_hw_pstate(ctxt); u64 mode = pstate & PSR_AA32_MODE_MASK; /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index c953fb4b9a13..a6d67c2bb5ae 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -183,6 +183,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 + finalise_el2_state /* Enable MMU, set vectors and stack. */ mov x0, x28 diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 0f9ac25afdf4..08d2b004f4b7 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -26,6 +26,7 @@ u64 id_aa64isar2_el1_sys_val; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; +u64 id_aa64smfr0_el1_sys_val; /* * Inject an unknown/undefined exception to an AArch64 guest while most of its diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b11cf2c618a6..3d61bd3e591d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -168,6 +168,25 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, return walker->cb(ctx, visit); } +static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, + int r) +{ + /* + * Visitor callbacks return EAGAIN when the conditions that led to a + * fault are no longer reflected in the page tables due to a race to + * update a PTE. In the context of a fault handler this is interpreted + * as a signal to retry guest execution. + * + * Ignore the return code altogether for walkers outside a fault handler + * (e.g. write protecting a range of memory) and chug along with the + * page table walk. 
+ */ + if (r == -EAGAIN) + return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); + + return !r; +} + static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); @@ -200,7 +219,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, table = kvm_pte_table(ctx.old, level); } - if (ret) + if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (!table) { @@ -211,13 +230,16 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); - if (ret) + if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: + if (kvm_pgtable_walk_continue(data->walker, ret)) + return 0; + return ret; } @@ -584,12 +606,14 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) lvls = 2; vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); +#ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally * on all CPUs. The features is RES0 on CPUs without the support * and must be ignored by the CPUs. */ vtcr |= VTCR_EL2_HA; +#endif /* CONFIG_ARM64_HW_AFDBM */ /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? @@ -1026,7 +1050,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) - return 0; + return -EAGAIN; data->level = ctx->level; data->pte = pte; @@ -1094,9 +1118,15 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL, 0); - dsb(ishst); + int ret; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + &pte, NULL, + KVM_PGTABLE_WALK_HANDLE_FAULT | + KVM_PGTABLE_WALK_SHARED); + if (!ret) + dsb(ishst); + return pte; } @@ -1141,6 +1171,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, + KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); if (!ret) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 1a97391fedd2..cd3f3117bf16 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -40,7 +40,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); val = read_sysreg(cpacr_el1); - val |= CPACR_EL1_TTA; + val |= CPACR_ELx_TTA; val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); @@ -120,6 +120,25 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) { + /* + * If we were in HYP context on entry, adjust the PSTATE view + * so that the usual helpers work correctly. 
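This undoes the EL2-to-EL1 squashing that to_hw_pstate() applies on entry; the two directions are inverses. A standalone round-trip check, with the PSTATE.M encodings from ptrace.h:

	#include <assert.h>
	#include <stdint.h>

	#define PSR_MODE_EL1t	0x4ull
	#define PSR_MODE_EL1h	0x5ull
	#define PSR_MODE_EL2t	0x8ull
	#define PSR_MODE_EL2h	0x9ull
	#define PSR_MODE_MASK	0xfull
	#define PSR_MODE32_BIT	0x10ull

	/* vEL2 runs in hardware EL1, so PSTATE.M is rewritten on entry
	 * and the rewrite is undone on exit. */
	static uint64_t el2_to_el1(uint64_t pstate)
	{
		uint64_t mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);

		if (mode == PSR_MODE_EL2t)
			mode = PSR_MODE_EL1t;
		else if (mode == PSR_MODE_EL2h)
			mode = PSR_MODE_EL1h;

		return (pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
	}

	static uint64_t el1_to_el2(uint64_t pstate)
	{
		uint64_t mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);

		if (mode == PSR_MODE_EL1t)
			mode = PSR_MODE_EL2t;
		else if (mode == PSR_MODE_EL1h)
			mode = PSR_MODE_EL2h;

		return (pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
	}

	int main(void)
	{
		uint64_t pstate = 0x3c0 | PSR_MODE_EL2h;	/* DAIF set, EL2h */

		assert(el1_to_el2(el2_to_el1(pstate)) == pstate);
		return 0;
	}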
+ */ + if (unlikely(vcpu_get_flag(vcpu, VCPU_HYP_CONTEXT))) { + u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + + switch (mode) { + case PSR_MODE_EL1t: + mode = PSR_MODE_EL2t; + break; + case PSR_MODE_EL1h: + mode = PSR_MODE_EL2h; + break; + } + + *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); + *vcpu_cpsr(vcpu) |= mode; + } } /* Switch to the guest for VHE systems running in EL2 */ @@ -154,6 +173,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); + if (is_hyp_ctxt(vcpu)) + vcpu_set_flag(vcpu, VCPU_HYP_CONTEXT); + else + vcpu_clear_flag(vcpu, VCPU_HYP_CONTEXT); + do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu); diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index c9f401fa01a9..64c086c02c60 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -198,7 +198,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); - if (gpa != GPA_INVALID) + if (gpa != INVALID_GPA) val[0] = gpa; break; case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index f32f4a2a347f..64c3aec0d937 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -12,17 +12,55 @@ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> #include <asm/esr.h> +static void pend_sync_exception(struct kvm_vcpu *vcpu) +{ + /* If not nesting, EL1 is the only possible exception target */ + if (likely(!vcpu_has_nv(vcpu))) { + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + return; + } + + /* + * With NV, we need to pick between EL1 and EL2. Note that we + * never deal with a nesting exception here, hence never + * changing context, and the exception itself can be delayed + * until the next entry. 
+ */ + switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) { + case PSR_MODE_EL2h: + case PSR_MODE_EL2t: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); + break; + case PSR_MODE_EL1h: + case PSR_MODE_EL1t: + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + break; + case PSR_MODE_EL0t: + if (vcpu_el2_tge_is_set(vcpu)) + kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); + else + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + break; + default: + BUG(); + } +} + +static bool match_target_el(struct kvm_vcpu *vcpu, unsigned long target) +{ + return (vcpu_get_flag(vcpu, EXCEPT_MASK) == target); +} + static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u64 esr = 0; - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - vcpu_write_sys_reg(vcpu, addr, FAR_EL1); + pend_sync_exception(vcpu); /* * Build an {i,d}abort, depending on the level and the @@ -43,14 +81,22 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr if (!is_iabt) esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; - vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1); + esr |= ESR_ELx_FSC_EXTABT; + + if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) { + vcpu_write_sys_reg(vcpu, addr, FAR_EL1); + vcpu_write_sys_reg(vcpu, esr, ESR_EL1); + } else { + vcpu_write_sys_reg(vcpu, addr, FAR_EL2); + vcpu_write_sys_reg(vcpu, esr, ESR_EL2); + } } static void inject_undef64(struct kvm_vcpu *vcpu) { u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + pend_sync_exception(vcpu); /* * Build an unknown exception, depending on the instruction @@ -59,7 +105,10 @@ static void inject_undef64(struct kvm_vcpu *vcpu) if (kvm_vcpu_trap_il_is32bit(vcpu)) esr |= ESR_ELx_IL; - vcpu_write_sys_reg(vcpu, esr, ESR_EL1); + if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) + vcpu_write_sys_reg(vcpu, esr, ESR_EL1); + else + vcpu_write_sys_reg(vcpu, esr, ESR_EL2); } #define DFSR_FSC_EXTABT_LPAE 0x10 diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a3ee3b605c9b..7113587222ff 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -25,11 +25,11 @@ static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; +static unsigned long __ro_after_init hyp_idmap_start; +static unsigned long __ro_after_init hyp_idmap_end; +static phys_addr_t __ro_after_init hyp_idmap_vector; -static unsigned long io_map_base; +static unsigned long __ro_after_init io_map_base; static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) { @@ -46,16 +46,17 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) * long will also starve other vCPUs. We have to also make sure that the page * tables are not freed while we released the lock. 
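A toy, single-threaded sketch of the batch-and-yield pattern used here (the kernel sizes batches with stage2_range_addr_end() and yields via cond_resched(); names below are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	typedef uint64_t phys_addr_t;

	static int apply_range(phys_addr_t addr, phys_addr_t end,
			       phys_addr_t batch,
			       int (*fn)(phys_addr_t, uint64_t))
	{
		while (addr < end) {
			phys_addr_t next = addr + batch < end ? addr + batch : end;
			int ret = fn(addr, next - addr);

			if (ret)
				return ret;

			/* The kernel would cond_resched() here between
			 * batches, re-checking afterwards that the page
			 * table was not freed meanwhile. */
			addr = next;
		}

		return 0;
	}

	static int visit(phys_addr_t addr, uint64_t size)
	{
		printf("apply [%#llx, %#llx)\n", (unsigned long long)addr,
		       (unsigned long long)(addr + size));
		return 0;
	}

	int main(void)
	{
		return apply_range(0, 0x5000, 0x2000, visit);
	}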
*/ -static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, +static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end, int (*fn)(struct kvm_pgtable *, u64, u64), bool resched) { + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); int ret; u64 next; do { - struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + struct kvm_pgtable *pgt = mmu->pgt; if (!pgt) return -EINVAL; @@ -71,8 +72,8 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, return ret; } -#define stage2_apply_range_resched(kvm, addr, end, fn) \ - stage2_apply_range(kvm, addr, end, fn, true) +#define stage2_apply_range_resched(mmu, addr, end, fn) \ + stage2_apply_range(mmu, addr, end, fn, true) static bool memslot_is_logging(struct kvm_memory_slot *memslot) { @@ -235,7 +236,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, may_block)); } @@ -250,7 +251,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush); + stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); } /** @@ -280,7 +281,7 @@ static void stage2_flush_vm(struct kvm *kvm) /** * free_hyp_pgds - free Hyp-mode page tables */ -void free_hyp_pgds(void) +void __init free_hyp_pgds(void) { mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgtable) { @@ -934,8 +935,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, */ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); - stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); } /** @@ -1383,7 +1383,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, KVM_PGTABLE_WALK_SHARED); + memcache, + KVM_PGTABLE_WALK_HANDLE_FAULT | + KVM_PGTABLE_WALK_SHARED); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { @@ -1401,20 +1403,18 @@ out_unlock: /* Resolve the access fault by making the page young again. 
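"Young" here means the stage-2 Access Flag, bit 10 of the leaf descriptor. A non-atomic toy sketch of the operation (the real walker updates the PTE with cmpxchg under KVM_PGTABLE_WALK_SHARED, and -EAGAIN is the new "retry the guest" signal):

	#include <errno.h>
	#include <stdint.h>

	typedef uint64_t kvm_pte_t;

	#define KVM_PTE_VALID			(1ull << 0)
	#define KVM_PTE_LEAF_ATTR_LO_S2_AF	(1ull << 10)	/* Access Flag */

	static int mkyoung(kvm_pte_t *ptep)
	{
		kvm_pte_t old = *ptep;

		/* Raced with an unmap: report -EAGAIN so the fault handler
		 * simply re-enters the guest instead of failing. */
		if (!(old & KVM_PTE_VALID))
			return -EAGAIN;

		*ptep = old | KVM_PTE_LEAF_ATTR_LO_S2_AF;
		return 0;
	}

	int main(void)
	{
		kvm_pte_t pte = KVM_PTE_VALID;

		return mkyoung(&pte) == 0 &&
		       (pte & KVM_PTE_LEAF_ATTR_LO_S2_AF) ? 0 : 1;
	}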
*/ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - pte_t pte; - kvm_pte_t kpte; + kvm_pte_t pte; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); - write_lock(&vcpu->kvm->mmu_lock); + read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); - write_unlock(&vcpu->kvm->mmu_lock); + pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + read_unlock(&vcpu->kvm->mmu_lock); - pte = __pte(kpte); - if (pte_valid(pte)) - kvm_set_pfn_accessed(pte_pfn(pte)); + if (kvm_pte_valid(pte)) + kvm_set_pfn_accessed(kvm_pte_to_pfn(pte)); } /** @@ -1668,7 +1668,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .virt_to_phys = kvm_host_pa, }; -int kvm_mmu_init(u32 *hyp_va_bits) +int __init kvm_mmu_init(u32 *hyp_va_bits) { int err; u32 idmap_bits; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c new file mode 100644 index 000000000000..315354d27978 --- /dev/null +++ b/arch/arm64/kvm/nested.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Columbia University and Linaro Ltd. + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> +#include <asm/sysreg.h> + +#include "sys_regs.h" + +/* Protection against the sysreg repainting madness... */ +#define NV_FTR(r, f) ID_AA64##r##_EL1_##f + +/* + * Our emulated CPU doesn't support all the possible features. For the + * sake of simplicity (and probably mental sanity), wipe out a number + * of feature bits we don't intend to support for the time being. + * This list should get updated as new features get added to the NV + * support, and new extension to the architecture. + */ +void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 id = reg_to_encoding(r); + u64 val, tmp; + + val = p->regval; + + switch (id) { + case SYS_ID_AA64ISAR0_EL1: + /* Support everything but TME, O.S. 
and Range TLBIs */ + val &= ~(NV_FTR(ISAR0, TLB) | + NV_FTR(ISAR0, TME)); + break; + + case SYS_ID_AA64ISAR1_EL1: + /* Support everything but PtrAuth and Spec Invalidation */ + val &= ~(GENMASK_ULL(63, 56) | + NV_FTR(ISAR1, SPECRES) | + NV_FTR(ISAR1, GPI) | + NV_FTR(ISAR1, GPA) | + NV_FTR(ISAR1, API) | + NV_FTR(ISAR1, APA)); + break; + + case SYS_ID_AA64PFR0_EL1: + /* No AMU, MPAM, S-EL2, RAS or SVE */ + val &= ~(GENMASK_ULL(55, 52) | + NV_FTR(PFR0, AMU) | + NV_FTR(PFR0, MPAM) | + NV_FTR(PFR0, SEL2) | + NV_FTR(PFR0, RAS) | + NV_FTR(PFR0, SVE) | + NV_FTR(PFR0, EL3) | + NV_FTR(PFR0, EL2) | + NV_FTR(PFR0, EL1)); + /* 64bit EL1/EL2/EL3 only */ + val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); + val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); + val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); + break; + + case SYS_ID_AA64PFR1_EL1: + /* Only support SSBS */ + val &= NV_FTR(PFR1, SSBS); + break; + + case SYS_ID_AA64MMFR0_EL1: + /* Hide ECV, FGT, ExS, Secure Memory */ + val &= ~(GENMASK_ULL(63, 43) | + NV_FTR(MMFR0, TGRAN4_2) | + NV_FTR(MMFR0, TGRAN16_2) | + NV_FTR(MMFR0, TGRAN64_2) | + NV_FTR(MMFR0, SNSMEM)); + + /* Disallow unsupported S2 page sizes */ + switch (PAGE_SIZE) { + case SZ_64K: + val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001); + fallthrough; + case SZ_16K: + val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001); + fallthrough; + case SZ_4K: + /* Support everything */ + break; + } + /* + * Since we can't support a guest S2 page size smaller than + * the host's own page size (due to KVM only populating its + * own S2 using the kernel's page size), advertise the + * limitation using FEAT_GTG. + */ + switch (PAGE_SIZE) { + case SZ_4K: + val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010); + fallthrough; + case SZ_16K: + val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010); + fallthrough; + case SZ_64K: + val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010); + break; + } + /* Cap PARange to 48bits */ + tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val); + if (tmp > 0b0101) { + val &= ~NV_FTR(MMFR0, PARANGE); + val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101); + } + break; + + case SYS_ID_AA64MMFR1_EL1: + val &= (NV_FTR(MMFR1, PAN) | + NV_FTR(MMFR1, LO) | + NV_FTR(MMFR1, HPDS) | + NV_FTR(MMFR1, VH) | + NV_FTR(MMFR1, VMIDBits)); + break; + + case SYS_ID_AA64MMFR2_EL1: + val &= ~(NV_FTR(MMFR2, EVT) | + NV_FTR(MMFR2, BBM) | + NV_FTR(MMFR2, TTL) | + GENMASK_ULL(47, 44) | + NV_FTR(MMFR2, ST) | + NV_FTR(MMFR2, CCIDX) | + NV_FTR(MMFR2, VARange)); + + /* Force TTL support */ + val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001); + break; + + case SYS_ID_AA64DFR0_EL1: + /* Only limited support for PMU, Debug, BPs and WPs */ + val &= (NV_FTR(DFR0, PMUVer) | + NV_FTR(DFR0, WRPs) | + NV_FTR(DFR0, BRPs) | + NV_FTR(DFR0, DebugVer)); + + /* Cap Debug to ARMv8.1 */ + tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); + if (tmp > 0b0111) { + val &= ~NV_FTR(DFR0, DebugVer); + val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111); + } + break; + + default: + /* Unknown register, just wipe it clean */ + val = 0; + break; + } + + p->regval = val; +} diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 78a09f7a6637..4ceabaa4c30b 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -19,7 +19,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) u64 steal = 0; int idx; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return; idx = srcu_read_lock(&kvm->srcu); @@ -40,7 +40,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case 
ARM_SMCCC_HV_PV_TIME_ST: - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) val = SMCCC_RET_SUCCESS; break; } @@ -54,7 +54,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; - if (base == GPA_INVALID) + if (base == INVALID_GPA) return base; /* @@ -89,7 +89,7 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, return -EFAULT; if (!IS_ALIGNED(ipa, 64)) return -EINVAL; - if (vcpu->arch.steal.base != GPA_INVALID) + if (vcpu->arch.steal.base != INVALID_GPA) return -EEXIST; /* Check the address is in a valid memslot */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e0267f672b8a..49a3257dec46 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -27,10 +27,11 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/virt.h> /* Maximum phys_shift supported for any VM on this host */ -static u32 kvm_ipa_limit; +static u32 __ro_after_init kvm_ipa_limit; /* * ARMv8 Reset Values @@ -38,12 +39,15 @@ static u32 kvm_ipa_limit; #define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ PSR_F_BIT | PSR_D_BIT) +#define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int kvm_sve_max_vl; +unsigned int __ro_after_init kvm_sve_max_vl; -int kvm_arm_init_sve(void) +int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); @@ -157,6 +161,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); + kfree(vcpu->arch.ccsidr); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) @@ -220,6 +225,10 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu) if (kvm_has_mte(kvm) && is32bit) return -EINVAL; + /* NV is incompatible with AArch32 */ + if (vcpu_has_nv(vcpu) && is32bit) + return -EINVAL; + if (is32bit) set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); @@ -272,6 +281,12 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); + /* Disallow NV+SVE for the time being */ + if (vcpu_has_nv(vcpu) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + ret = -EINVAL; + goto out; + } + if (!kvm_arm_vcpu_sve_finalized(vcpu)) { if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) { ret = kvm_vcpu_enable_sve(vcpu); @@ -294,6 +309,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) default: if (vcpu_el1_is_32bit(vcpu)) { pstate = VCPU_RESET_PSTATE_SVC; + } else if (vcpu_has_nv(vcpu)) { + pstate = VCPU_RESET_PSTATE_EL2; } else { pstate = VCPU_RESET_PSTATE_EL1; } @@ -352,7 +369,7 @@ u32 get_kvm_ipa_limit(void) return kvm_ipa_limit; } -int kvm_set_ipa_limit(void) +int __init kvm_set_ipa_limit(void) { unsigned int parange; u64 mmfr0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c6cbfe6b854b..53749d3a0996 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -11,6 +11,7 @@ #include <linux/bitfield.h> #include <linux/bsearch.h> +#include <linux/cacheinfo.h> #include <linux/kvm_host.h> #include <linux/mm.h> #include <linux/printk.h> @@ -24,6 +25,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/perf_event.h> #include <asm/sysreg.h> @@ -78,28 +80,112 @@ void 
vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) __vcpu_write_sys_reg_to_cpu(val, reg)) return; - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_sys_reg(vcpu, reg) = val; } -/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; - /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 +/* + * Returns the minimum line size for the selected cache, expressed as + * Log2(bytes). + */ +static u8 get_min_cache_line_size(bool icache) +{ + u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0); + u8 field; + + if (icache) + field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr); + else + field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr); + + /* + * Cache line size is represented as Log2(words) in CTR_EL0. + * Log2(bytes) can be derived with the following: + * + * Log2(words) + 2 = Log2(bytes / 4) + 2 + * = Log2(bytes) - 2 + 2 + * = Log2(bytes) + */ + return field + 2; +} + /* Which cache CCSIDR represents depends on CSSELR value. */ -static u32 get_ccsidr(u32 csselr) +static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr) { - u32 ccsidr; + u8 line_size; - /* Make sure noone else changes CSSELR during this! */ - local_irq_disable(); - write_sysreg(csselr, csselr_el1); - isb(); - ccsidr = read_sysreg(ccsidr_el1); - local_irq_enable(); + if (vcpu->arch.ccsidr) + return vcpu->arch.ccsidr[csselr]; - return ccsidr; + line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD); + + /* + * Fabricate a CCSIDR value as the overriding value does not exist. + * The real CCSIDR value will not be used as it can vary by the + * physical CPU which the vcpu currently resides in. + * + * The line size is determined with get_min_cache_line_size(), which + * should be valid for all CPUs even if they have different cache + * configuration. + * + * The associativity bits are cleared, meaning the geometry of all data + * and unified caches (which are guaranteed to be PIPT and thus + * non-aliasing) are 1 set and 1 way. + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. The exposed geometry minimizes the number of the traps. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + * + * We don't check if the cache level exists as it is allowed to return + * an UNKNOWN value if not. 
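+ * + * Worked example (editor's sketch, not part of the patch): on a host + * whose CTR_EL0.DminLine reads 4 (Log2(words), i.e. 64-byte lines), + * get_min_cache_line_size(false) returns 4 + 2 = 6, so the value + * fabricated below has CCSIDR_EL1.LineSize = 6 - 4 = 2 and leaves + * NumSets and Associativity at 0, i.e. one 64-byte line, 1 set, 1 way.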
+ */ + return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4); +} + +static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val) +{ + u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4; + u32 *ccsidr = vcpu->arch.ccsidr; + u32 i; + + if ((val & CCSIDR_EL1_RES0) || + line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD)) + return -EINVAL; + + if (!ccsidr) { + if (val == get_ccsidr(vcpu, csselr)) + return 0; + + ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT); + if (!ccsidr) + return -ENOMEM; + + for (i = 0; i < CSSELR_MAX; i++) + ccsidr[i] = get_ccsidr(vcpu, i); + + vcpu->arch.ccsidr = ccsidr; + } + + ccsidr[csselr] = val; + + return 0; +} + +static bool access_rw(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); + + return true; } /* @@ -260,6 +346,14 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } +static bool trap_undef(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 @@ -370,12 +464,9 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (p->is_write) { - vcpu_write_sys_reg(vcpu, p->regval, r->reg); + access_rw(vcpu, p, r); + if (p->is_write) vcpu_set_flag(vcpu, DEBUG_DIRTY); - } else { - p->regval = vcpu_read_sys_reg(vcpu, r->reg); - } trace_trap_reg(__func__, r->reg, p->is_write, p->regval); @@ -1049,7 +1140,9 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, treg = TIMER_REG_CVAL; break; default: - BUG(); + print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); + kvm_inject_undefined(vcpu); + return false; } if (p->is_write) @@ -1155,6 +1248,12 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), pmuver_to_perfmon(vcpu_pmuver(vcpu))); break; + case SYS_ID_AA64MMFR2_EL1: + val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; + break; + case SYS_ID_MMFR4_EL1: + val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); + break; } return val; @@ -1205,6 +1304,9 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, return write_to_read_only(vcpu, p, r); p->regval = read_id_reg(vcpu, r); + if (vcpu_has_nv(vcpu)) + access_nested_id_reg(vcpu, p, r); + return true; } @@ -1385,10 +1487,78 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) return write_to_read_only(vcpu, p, r); - p->regval = read_sysreg(clidr_el1); + p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } +/* + * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary + * by the physical CPU which the vcpu currently resides in. + */ +static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 clidr; + u8 loc; + + if ((ctr_el0 & CTR_EL0_IDC)) { + /* + * Data cache clean to the PoU is not required so LoUU and LoUIS + * will not be set and a unified cache, which will be marked as + * LoC, will be added. + * + * If not DIC, let the unified cache L2 so that an instruction + * cache can be added as L1 later. + */ + loc = (ctr_el0 & CTR_EL0_DIC) ? 
1 : 2; + clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc); + } else { + /* + * Data cache clean to the PoU is required so let L1 have a data + * cache and mark it as LoUU and LoUIS. As L1 has a data cache, + * it can be marked as LoC too. + */ + loc = 1; + clidr = 1 << CLIDR_LOUU_SHIFT; + clidr |= 1 << CLIDR_LOUIS_SHIFT; + clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1); + } + + /* + * Instruction cache invalidation to the PoU is required so let L1 have + * an instruction cache. If L1 already has a data cache, it will be + * CACHE_TYPE_SEPARATE. + */ + if (!(ctr_el0 & CTR_EL0_DIC)) + clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1); + + clidr |= loc << CLIDR_LOC_SHIFT; + + /* + * Add tag cache unified to data cache. Allocation tags and data are + * unified in a cache line so that it looks valid even if there is only + * one cache line. + */ + if (kvm_has_mte(vcpu->kvm)) + clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); + + __vcpu_sys_reg(vcpu, r->reg) = clidr; +} + +static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) +{ + u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val)); + + if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) + return -EINVAL; + + __vcpu_sys_reg(vcpu, rd->reg) = val; + + return 0; +} + static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1410,22 +1580,10 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return write_to_read_only(vcpu, p, r); csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); - p->regval = get_ccsidr(csselr); + csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD; + if (csselr < CSSELR_MAX) + p->regval = get_ccsidr(vcpu, csselr); - /* - * Guests should not be doing cache operations by set/way at all, and - * for this reason, we trap them and attempt to infer the intent, so - * that we can flush the entire guest's address space at the appropriate - * time. - * To prevent this trapping from causing performance problems, let's - * expose the geometry of all data and unified caches (which are - * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way. - * [If guests should attempt to infer aliasing properties from the - * geometry (which is not permitted by the architecture), they would - * only do so for virtually indexed caches.] - */ - if (!(csselr & 1)) // data or unified cache - p->regval &= ~GENMASK(27, 3); return true; } @@ -1446,6 +1604,44 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = mte_visibility, \ } +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (vcpu_has_nv(vcpu)) + return 0; + + return REG_HIDDEN; +} + +#define EL2_REG(name, acc, rst, v) { \ + SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .visibility = el2_visibility, \ + .val = v, \ +} + +/* + * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when + * HCR_EL2.E2H==1, and only in the sysreg table for convenience of + * handling traps. Given that, they are always hidden from userspace. 
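+ * + * For example (editor's note): EL12_REG(SCTLR, ...) below produces a + * SYS_SCTLR_EL12 descriptor whose .reg is SCTLR_EL1, so a guest + * hypervisor running with HCR_EL2.E2H==1 reads and writes the vcpu's + * SCTLR_EL1 copy whenever it touches SCTLR_EL12.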
+ */ +static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return REG_HIDDEN_USER; +} + +#define EL12_REG(name, acc, rst, v) { \ + SYS_DESC(SYS_##name##_EL12), \ + .access = acc, \ + .reset = rst, \ + .reg = name##_EL1, \ + .val = v, \ + .visibility = elx2_visibility, \ +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1490,6 +1686,42 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, .visibility = raz_visibility, \ } +static bool access_sp_el1(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, SP_EL1); + + return true; +} + +static bool access_elr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1); + else + p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1); + + return true; +} + +static bool access_spsr(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); + + return true; +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -1646,6 +1878,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { PTRAUTH_KEY(APDB), PTRAUTH_KEY(APGA), + { SYS_DESC(SYS_SPSR_EL1), access_spsr}, + { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -1693,7 +1928,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_LORC_EL1), trap_loregion }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, - { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, @@ -1717,7 +1952,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, - { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, + .set_user = set_clidr }, + { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, { SYS_DESC(SYS_CTR_EL0), access_ctr }, @@ -1913,9 +2150,67 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper, .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, + EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0), + EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0), + EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), + EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), + EL2_REG(HCR_EL2, access_rw, reset_val, 0), + EL2_REG(MDCR_EL2, access_rw, reset_val, 0), + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ), + EL2_REG(HSTR_EL2, access_rw, reset_val, 0), + EL2_REG(HACR_EL2, access_rw, reset_val, 0), + + EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), + EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), + EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + 
EL2_REG(VTTBR_EL2, access_rw, reset_val, 0), + EL2_REG(VTCR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, + EL2_REG(SPSR_EL2, access_rw, reset_val, 0), + EL2_REG(ELR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_SP_EL1), access_sp_el1}, + { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, + EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), + EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), + EL2_REG(ESR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, + + EL2_REG(FAR_EL2, access_rw, reset_val, 0), + EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), + + EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + + EL2_REG(VBAR_EL2, access_rw, reset_val, 0), + EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_RMR_EL2), trap_undef }, + + EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), + EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), + + EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), + EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + + EL12_REG(SCTLR, access_vm_reg, reset_val, 0x00C50078), + EL12_REG(CPACR, access_rw, reset_val, 0), + EL12_REG(TTBR0, access_vm_reg, reset_unknown, 0), + EL12_REG(TTBR1, access_vm_reg, reset_unknown, 0), + EL12_REG(TCR, access_vm_reg, reset_val, 0), + { SYS_DESC(SYS_SPSR_EL12), access_spsr}, + { SYS_DESC(SYS_ELR_EL12), access_elr}, + EL12_REG(AFSR0, access_vm_reg, reset_unknown, 0), + EL12_REG(AFSR1, access_vm_reg, reset_unknown, 0), + EL12_REG(ESR, access_vm_reg, reset_unknown, 0), + EL12_REG(FAR, access_vm_reg, reset_unknown, 0), + EL12_REG(MAIR, access_vm_reg, reset_unknown, 0), + EL12_REG(AMAIR, access_vm_reg, reset_amair_el1, 0), + EL12_REG(VBAR, access_rw, reset_val, 0), + EL12_REG(CONTEXTIDR, access_vm_reg, reset_val, 0), + EL12_REG(CNTKCTL, access_rw, reset_val, 0), + + EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, @@ -2219,6 +2514,10 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + + /* CCSIDR2 */ + { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, }; @@ -2724,7 +3023,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, FUNCTION_INVARIANT(midr_el1) FUNCTION_INVARIANT(revidr_el1) -FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) @@ -2733,10 +3031,9 @@ static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) } /* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] = { +static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, - { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; @@ -2773,33 +3070,7 @@ static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) return 0; } -static bool is_valid_cache(u32 val) -{ - u32 level, ctype; - - if (val >= CSSELR_MAX) - return false; - - /* Bottom bit is Instruction or Data bit. Next 3 bits are level. 
*/ - level = (val >> 1); - ctype = (cache_levels >> (level * 3)) & 7; - - switch (ctype) { - case 0: /* No cache */ - return false; - case 1: /* Instruction cache only */ - return (val & 1); - case 2: /* Data cache only */ - case 4: /* Unified cache */ - return !(val & 1); - case 3: /* Separate instruction and data caches */ - return true; - default: /* Reserved: we can't know instruction or data. */ - return false; - } -} - -static int demux_c15_get(u64 id, void __user *uaddr) +static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; u32 __user *uval = uaddr; @@ -2815,16 +3086,16 @@ static int demux_c15_get(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; - return put_user(get_ccsidr(val), uval); + return put_user(get_ccsidr(vcpu, val), uval); default: return -ENOENT; } } -static int demux_c15_set(u64 id, void __user *uaddr) +static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val, newval; u32 __user *uval = uaddr; @@ -2840,16 +3111,13 @@ static int demux_c15_set(u64 id, void __user *uaddr) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) + if (val >= CSSELR_MAX) return -ENOENT; if (get_user(newval, uval)) return -EFAULT; - /* This is also invariant: you can't change it. */ - if (newval != get_ccsidr(val)) - return -EINVAL; - return 0; + return set_ccsidr(vcpu, val, newval); default: return -ENOENT; } @@ -2864,7 +3132,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r) + if (!r || sysreg_hidden_user(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -2886,7 +3154,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_get(reg->id, uaddr); + return demux_c15_get(vcpu, reg->id, uaddr); err = get_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) @@ -2908,7 +3176,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r) + if (!r || sysreg_hidden_user(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -2930,7 +3198,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_set(reg->id, uaddr); + return demux_c15_set(vcpu, reg->id, uaddr); err = set_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) @@ -2942,13 +3210,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg static unsigned int num_demux_regs(void) { - unsigned int i, count = 0; - - for (i = 0; i < CSSELR_MAX; i++) - if (is_valid_cache(i)) - count++; - - return count; + return CSSELR_MAX; } static int write_demux_regids(u64 __user *uindices) @@ -2958,8 +3220,6 @@ static int write_demux_regids(u64 __user *uindices) val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; for (i = 0; i < CSSELR_MAX; i++) { - if (!is_valid_cache(i)) - continue; if (put_user(val | i, uindices)) return -EFAULT; uindices++; @@ -3002,7 +3262,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden(vcpu, rd)) + if (sysreg_hidden_user(vcpu, rd)) return 0; if 
(!copy_reg_to_user(rd, uind)) @@ -3057,11 +3317,10 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -int kvm_sys_reg_table_init(void) +int __init kvm_sys_reg_table_init(void) { bool valid = true; unsigned int i; - struct sys_reg_desc clidr; /* Make sure tables are unique and in order. */ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); @@ -3078,23 +3337,5 @@ int kvm_sys_reg_table_init(void) for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); - /* - * CLIDR format is awkward, so clean it up. See ARM B4.1.20: - * - * If software reads the Cache Type fields from Ctype1 - * upwards, once it has seen a value of 0b000, no caches - * exist at further-out levels of the hierarchy. So, for - * example, if Ctype3 is the first Cache Type field with a - * value of 0b000, the values of Ctype4 to Ctype7 must be - * ignored. - */ - get_clidr_el1(NULL, &clidr); /* Ugly... */ - cache_levels = clidr.val; - for (i = 0; i < 7; i++) - if (((cache_levels >> (i*3)) & 7) == 0) - break; - /* Clear all higher bits. */ - cache_levels &= (1 << (i*3))-1; - return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e4ebb3a379fd..6b11f2cc7146 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -85,8 +85,9 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ -#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ -#define REG_USER_WI (1 << 2) /* WI from userspace only */ +#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */ +#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 3) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -152,6 +153,15 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_HIDDEN; } +static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + if (likely(!r->visibility)) + return false; + + return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER); +} + static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 33e4e7dd2719..f3e46a976125 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -2,6 +2,7 @@ #if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ARM_ARM64_KVM_H +#include <asm/kvm_emulate.h> #include <kvm/arm_arch_timer.h> #include <linux/tracepoint.h> @@ -301,6 +302,64 @@ TRACE_EVENT(kvm_timer_emulate, __entry->timer_idx, __entry->should_fire) ); +TRACE_EVENT(kvm_nested_eret, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2, + unsigned long spsr_el2), + TP_ARGS(vcpu, elr_el2, spsr_el2), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, elr_el2) + __field(unsigned long, spsr_el2) + __field(unsigned long, target_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->elr_el2 = elr_el2; + __entry->spsr_el2 = spsr_el2; + __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + __entry->elr_el2, __entry->spsr_el2, + 
__print_symbolic(__entry->target_mode, kvm_mode_names), + __entry->hcr_el2) +); + +TRACE_EVENT(kvm_inject_nested_exception, + TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type), + TP_ARGS(vcpu, esr_el2, type), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(unsigned long, esr_el2) + __field(int, type) + __field(unsigned long, spsr_el2) + __field(unsigned long, pc) + __field(unsigned long, source_mode) + __field(unsigned long, hcr_el2) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->esr_el2 = esr_el2; + __entry->type = type; + __entry->spsr_el2 = *vcpu_cpsr(vcpu); + __entry->pc = *vcpu_pc(vcpu); + __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); + __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); + ), + + TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx", + __print_symbolic(__entry->type, kvm_exception_type_names), + __entry->esr_el2, __entry->pc, __entry->spsr_el2, + __print_symbolic(__entry->source_mode, kvm_mode_names), + __entry->hcr_el2) +); + #endif /* _TRACE_ARM_ARM64_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index f6d4f4052555..cd134db41a57 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -465,17 +465,15 @@ out: /* GENERIC PROBE */ -static int vgic_init_cpu_starting(unsigned int cpu) +void kvm_vgic_cpu_up(void) { enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; } -static int vgic_init_cpu_dying(unsigned int cpu) +void kvm_vgic_cpu_down(void) { disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; } static irqreturn_t vgic_maintenance_handler(int irq, void *data) @@ -572,7 +570,7 @@ int kvm_vgic_hyp_init(void) if (ret) return ret; - if (!has_mask) + if (!has_mask && !kvm_vgic_global_state.maint_irq) return 0; ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, @@ -584,19 +582,6 @@ int kvm_vgic_hyp_init(void) return ret; } - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "kvm/arm/vgic:starting", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); return 0; - -out_free_irq: - free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index b32d434c1d4a..e67b3b2c8044 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -473,9 +473,10 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, * active state can be overwritten when the VCPU's state is synced coming back * from the guest. * - * For shared interrupts as well as GICv3 private interrupts, we have to - * stop all the VCPUs because interrupts can be migrated while we don't hold - * the IRQ locks and we don't want to be chasing moving targets. + * For shared interrupts as well as GICv3 private interrupts accessed from the + * non-owning CPU, we have to stop all the VCPUs because interrupts can be + * migrated while we don't hold the IRQ locks and we don't want to be chasing + * moving targets. 
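+ * + * (Editor's note: concretely, with the vcpu check added below, a vcpu + * trapping on the active state of its own GICv3 private interrupts is + * served without halting the VM; only accesses made from outside the + * running vcpu, such as a userspace save of its state, stop all VCPUs.)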
* * For GICv2 private interrupts we don't have to do anything because * userspace accesses to the VGIC state already require all VCPUs to be @@ -484,7 +485,8 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, */ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } @@ -492,7 +494,8 @@ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) /* See vgic_access_active_prepare */ static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && + vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 684bdfaad4a9..469d816f356f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -3,6 +3,7 @@ #include <linux/irqchip/arm-gic-v3.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <kvm/arm_vgic.h> @@ -584,25 +585,25 @@ DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); static int __init early_group0_trap_cfg(char *buf) { - return strtobool(buf, &group0_trap); + return kstrtobool(buf, &group0_trap); } early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); static int __init early_group1_trap_cfg(char *buf) { - return strtobool(buf, &group1_trap); + return kstrtobool(buf, &group1_trap); } early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); static int __init early_common_trap_cfg(char *buf) { - return strtobool(buf, &common_trap); + return kstrtobool(buf, &common_trap); } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); static int __init early_gicv4_enable(char *buf) { - return strtobool(buf, &gicv4_enable); + return kstrtobool(buf, &gicv4_enable); } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index d78ae63d7c15..08978d0672e7 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -16,7 +16,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> -unsigned int kvm_arm_vmid_bits; +unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; @@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) /* * Initialize the VMID allocator */ -int kvm_arm_vmid_alloc_init(void) +int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); @@ -190,7 +190,7 @@ int kvm_arm_vmid_alloc_init(void) return 0; } -void kvm_arm_vmid_alloc_free(void) +void __init kvm_arm_vmid_alloc_free(void) { kfree(vmid_map); } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 10dcfa13390a..37b1340e9646 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -33,6 +33,7 @@ HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC HAS_LDAPR HAS_LSE_ATOMICS +HAS_NESTED_VIRT HAS_NO_FPSIMD HAS_NO_HW_PREFETCH HAS_PAN diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index 7f27d66a17e1..6fa0468caa00 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -103,6 +103,7 @@ END 
{ res0 = "UL(0)" res1 = "UL(0)" + unkn = "UL(0)" next_bit = 63 @@ -117,11 +118,13 @@ END { define(reg "_RES0", "(" res0 ")") define(reg "_RES1", "(" res1 ")") + define(reg "_UNKN", "(" unkn ")") print "" reg = null res0 = null res1 = null + unkn = null next } @@ -139,6 +142,7 @@ END { res0 = "UL(0)" res1 = "UL(0)" + unkn = "UL(0)" define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") @@ -166,7 +170,9 @@ END { define(reg "_RES0", "(" res0 ")") if (res1 != null) define(reg "_RES1", "(" res1 ")") - if (res0 != null || res1 != null) + if (unkn != null) + define(reg "_UNKN", "(" unkn ")") + if (res0 != null || res1 != null || unkn != null) print "" reg = null @@ -177,6 +183,7 @@ END { op2 = null res0 = null res1 = null + unkn = null next } @@ -195,6 +202,7 @@ END { next_bit = 0 res0 = null res1 = null + unkn = null next } @@ -220,6 +228,16 @@ END { next } +/^Unkn/ && (block == "Sysreg" || block == "SysregFields") { + expect_fields(2) + parse_bitdef(reg, "UNKN", $2) + field = "UNKN_" msb "_" lsb + + unkn = unkn " | GENMASK_ULL(" msb ", " lsb ")" + + next +} + /^Field/ && (block == "Sysreg" || block == "SysregFields") { expect_fields(3) field = $3 diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 94d78acafb67..dd5a9c7e310f 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -15,6 +15,8 @@ # Res1 <msb>[:<lsb>] +# Unkn <msb>[:<lsb>] + # Field <msb>[:<lsb>] <name> # Enum <msb>[:<lsb>] <name> @@ -1779,6 +1781,16 @@ Sysreg SCXTNUM_EL1 3 0 13 0 7 Field 63:0 SoftwareContextNumber EndSysreg +# The bit layout for CCSIDR_EL1 depends on whether FEAT_CCIDX is implemented. +# The following is for case when FEAT_CCIDX is not implemented. +Sysreg CCSIDR_EL1 3 1 0 0 0 +Res0 63:32 +Unkn 31:28 +Field 27:13 NumSets +Field 12:3 Associativity +Field 2:0 LineSize +EndSysreg + Sysreg CLIDR_EL1 3 1 0 0 1 Res0 63:47 Field 46:33 Ttypen @@ -1795,6 +1807,11 @@ Field 5:3 Ctype2 Field 2:0 Ctype1 EndSysreg +Sysreg CCSIDR2_EL1 3 1 0 0 2 +Res0 63:24 +Field 23:0 NumSets +EndSysreg + Sysreg GMID_EL1 3 1 0 0 4 Res0 63:4 Field 3:0 BS diff --git a/arch/csky/lib/delay.c b/arch/csky/lib/delay.c index 22570b0790d6..f5db317313bb 100644 --- a/arch/csky/lib/delay.c +++ b/arch/csky/lib/delay.c @@ -5,7 +5,7 @@ #include <linux/init.h> #include <linux/delay.h> -void __delay(unsigned long loops) +void __aligned(8) __delay(unsigned long loops) { asm volatile ( "mov r0, r0\n" diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index ae9ff07de4ab..d7e1cabee2ec 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -43,4 +43,4 @@ obj-$(CONFIG_ELF_CORE) += elfcore.o CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. 
-include $(src)/Makefile.gate +include $(srctree)/$(src)/Makefile.gate diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 9cc8b84f7eb0..7fd51257e0ed 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -94,15 +94,21 @@ config LOONGARCH select HAVE_DYNAMIC_FTRACE_WITH_ARGS select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT + select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GENERIC_VDSO + select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_PCI @@ -441,6 +447,24 @@ config ARCH_IOREMAP protection support. However, you can enable LoongArch DMW-based ioremap() for better performance. +config ARCH_STRICT_ALIGN + bool "Enable -mstrict-align to prevent unaligned accesses" if EXPERT + default y + help + Not all LoongArch cores support h/w unaligned access, so we can use + the -mstrict-align build parameter to prevent unaligned accesses. + + CPUs with h/w unaligned access support: + Loongson-2K2000/2K3000/3A5000/3C5000/3D5000. + + CPUs without h/w unaligned access support: + Loongson-2K500/2K1000. + + This option is enabled by default so that the kernel is able to run + on all LoongArch systems. But you can disable it manually if you want + to run the kernel only on systems with h/w unaligned access support in + order to optimise for performance. + config KEXEC bool "Kexec system call" select KEXEC_CORE @@ -454,6 +478,7 @@ config KEXEC config CRASH_DUMP bool "Build kdump crash kernel" + select RELOCATABLE help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels which are @@ -463,16 +488,38 @@ config CRASH_DUMP For more details see Documentation/admin-guide/kdump/kdump.rst -config PHYSICAL_START - hex "Physical address where the kernel is loaded" - default "0x90000000a0000000" - depends on CRASH_DUMP help - This gives the XKPRANGE address where the kernel is loaded. - If you plan to use kernel for capturing the crash dump change - this value to start of the reserved region (the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel). +config RELOCATABLE + bool "Relocatable kernel" help + This builds the kernel as a Position Independent Executable (PIE), + which retains all of the relocation metadata required to relocate + the kernel binary at runtime to a virtual address different from + its link address. + +config RANDOMIZE_BASE + bool "Randomize the address of the kernel (KASLR)" + depends on RELOCATABLE + help + Randomizes the physical and virtual address at which the + kernel image is loaded, as a security feature that + deters exploit attempts relying on knowledge of the location + of kernel internals. + + The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET. + + If unsure, say N. + +config RANDOMIZE_BASE_MAX_OFFSET + hex "Maximum KASLR offset" if EXPERT + depends on RANDOMIZE_BASE + range 0x0 0x10000000 + default "0x01000000" + help + When KASLR is active, this provides the maximum offset that will + be applied to the kernel image. It should be set according to the + amount of physical RAM available in the target system.
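+ + For example (editor's illustration): the default of 0x01000000 lets + the image be placed anywhere within 16MB above its link address.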
+ + This is limited by the size of the lower address memory, 256MB. config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 4402387d2755..f71edf574101 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -71,14 +71,15 @@ KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs endif +ifeq ($(CONFIG_RELOCATABLE),y) +KBUILD_CFLAGS_KERNEL += -fPIE +LDFLAGS_vmlinux += -static -pie --no-dynamic-linker -z notext +endif + cflags-y += -ffreestanding cflags-y += $(call cc-option, -mno-check-zero-division) -ifndef CONFIG_PHYSICAL_START load-y = 0x9000000000200000 -else -load-y = $(CONFIG_PHYSICAL_START) -endif bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y) drivers-$(CONFIG_PCI) += arch/loongarch/pci/ @@ -91,10 +92,15 @@ KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y) # instead of .eh_frame so we don't discard them. KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +ifdef CONFIG_ARCH_STRICT_ALIGN # Don't emit unaligned accesses. # Not all LoongArch cores support unaligned access, and as the kernel we can't # rely on others to provide emulation for these accesses. KBUILD_CFLAGS += $(call cc-option,-mstrict-align) +else +# Optimise for performance on hardware that supports unaligned access. +KBUILD_CFLAGS += $(call cc-option,-mno-strict-align) +endif KBUILD_CFLAGS += -isystem $(shell $(CC) -print-file-name=include) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index eb84cae642e5..e18213f01cc4 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -48,6 +48,7 @@ CONFIG_HOTPLUG_CPU=y CONFIG_NR_CPUS=64 CONFIG_NUMA=y CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y CONFIG_SUSPEND=y CONFIG_HIBERNATION=y CONFIG_ACPI=y diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index d342935e5a72..8fb699b4d40a 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -125,4 +125,6 @@ extern unsigned long vm_map_base; #define ISA_IOSIZE SZ_16K #define IO_SPACE_LIMIT (PCI_IOSIZE - 1) +#define PHYS_LINK_KADDR PHYSADDR(VMLINUX_LOAD_ADDRESS) + #endif /* _ASM_ADDRSPACE_H */ diff --git a/arch/loongarch/include/asm/asm.h b/arch/loongarch/include/asm/asm.h index 40eea6aa469e..f591b3245def 100644 --- a/arch/loongarch/include/asm/asm.h +++ b/arch/loongarch/include/asm/asm.h @@ -188,4 +188,14 @@ #define PTRLOG 3 #endif +/* Annotate a function as being unsuitable for kprobes.
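(Editor's note: the macro records the symbol's address in the _kprobe_blacklist section, which the generic kprobes code walks in order to refuse probes on such functions; handle_syscall in entry.S, further down in this patch, is one user.)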
*/ +#ifdef CONFIG_KPROBES +#define _ASM_NOKPROBE(name) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad name; \ + .popsection +#else +#define _ASM_NOKPROBE(name) +#endif + #endif /* __ASM_ASM_H */ diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h index be037a40580d..c51a1b43acb4 100644 --- a/arch/loongarch/include/asm/asmmacro.h +++ b/arch/loongarch/include/asm/asmmacro.h @@ -274,4 +274,21 @@ nor \dst, \src, zero .endm +.macro la_abs reg, sym +#ifndef CONFIG_RELOCATABLE + la.abs \reg, \sym +#else + 766: + lu12i.w \reg, 0 + ori \reg, \reg, 0 + lu32i.d \reg, 0 + lu52i.d \reg, \reg, 0 + .pushsection ".la_abs", "aw", %progbits + 768: + .dword 768b-766b + .dword \sym + .popsection +#endif +.endm + #endif /* _ASM_ASMMACRO_H */ diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index 754f28506791..c3da91759472 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -36,7 +36,7 @@ #define PRID_SERIES_LA132 0x8000 /* Loongson 32bit */ #define PRID_SERIES_LA264 0xa000 /* Loongson 64bit, 2-issue */ -#define PRID_SERIES_LA364 0xb000 /* Loongson 64bit,3-issue */ +#define PRID_SERIES_LA364 0xb000 /* Loongson 64bit, 3-issue */ #define PRID_SERIES_LA464 0xc000 /* Loongson 64bit, 4-issue */ #define PRID_SERIES_LA664 0xd000 /* Loongson 64bit, 6-issue */ diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h new file mode 100644 index 000000000000..21447fb1efc7 --- /dev/null +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2023 Loongson Technology Corporation Limited + */ +#ifndef __ASM_HW_BREAKPOINT_H +#define __ASM_HW_BREAKPOINT_H + +#include <asm/loongarch.h> + +#ifdef __KERNEL__ + +/* Breakpoint */ +#define LOONGARCH_BREAKPOINT_EXECUTE (0 << 0) + +/* Watchpoints */ +#define LOONGARCH_BREAKPOINT_LOAD (1 << 0) +#define LOONGARCH_BREAKPOINT_STORE (1 << 1) + +struct arch_hw_breakpoint_ctrl { + u32 __reserved : 28, + len : 2, + type : 2; +}; + +struct arch_hw_breakpoint { + u64 address; + u64 mask; + struct arch_hw_breakpoint_ctrl ctrl; +}; + +/* Lengths */ +#define LOONGARCH_BREAKPOINT_LEN_1 0b11 +#define LOONGARCH_BREAKPOINT_LEN_2 0b10 +#define LOONGARCH_BREAKPOINT_LEN_4 0b01 +#define LOONGARCH_BREAKPOINT_LEN_8 0b00 + +/* + * Limits. + * Changing these will require modifications to the register accessors. + */ +#define LOONGARCH_MAX_BRP 8 +#define LOONGARCH_MAX_WRP 8 + +/* Virtual debug register bases. */ +#define CSR_CFG_ADDR 0 +#define CSR_CFG_MASK (CSR_CFG_ADDR + LOONGARCH_MAX_BRP) +#define CSR_CFG_CTRL (CSR_CFG_MASK + LOONGARCH_MAX_BRP) +#define CSR_CFG_ASID (CSR_CFG_CTRL + LOONGARCH_MAX_WRP) + +/* Debug register names. */ +#define LOONGARCH_CSR_NAME_ADDR ADDR +#define LOONGARCH_CSR_NAME_MASK MASK +#define LOONGARCH_CSR_NAME_CTRL CTRL +#define LOONGARCH_CSR_NAME_ASID ASID + +/* Accessor macros for the debug registers. 
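(Editor's note: the T argument selects the bank; for instance LOONGARCH_CSR_WATCH_READ(0, ADDR, 0, val) token-pastes to csr_read64(LOONGARCH_CSR_IB0ADDR), the instruction-breakpoint side, while a non-zero T reads the data-breakpoint LOONGARCH_CSR_DB0ADDR.)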
*/ +#define LOONGARCH_CSR_WATCH_READ(N, REG, T, VAL) \ +do { \ + if (T == 0) \ + VAL = csr_read64(LOONGARCH_CSR_##IB##N##REG); \ + else \ + VAL = csr_read64(LOONGARCH_CSR_##DB##N##REG); \ +} while (0) + +#define LOONGARCH_CSR_WATCH_WRITE(N, REG, T, VAL) \ +do { \ + if (T == 0) \ + csr_write64(VAL, LOONGARCH_CSR_##IB##N##REG); \ + else \ + csr_write64(VAL, LOONGARCH_CSR_##DB##N##REG); \ +} while (0) + +/* Exact number */ +#define CSR_FWPC_NUM 0x3f +#define CSR_MWPC_NUM 0x3f + +#define CTRL_PLV_ENABLE 0x1e + +#define MWPnCFG3_LoadEn 8 +#define MWPnCFG3_StoreEn 9 + +#define MWPnCFG3_Type_mask 0x3 +#define MWPnCFG3_Size_mask 0x3 + +static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl) +{ + return (ctrl.len << 10) | (ctrl.type << 8); +} + +static inline void decode_ctrl_reg(u32 reg, struct arch_hw_breakpoint_ctrl *ctrl) +{ + reg >>= 8; + ctrl->type = reg & MWPnCFG3_Type_mask; + reg >>= 2; + ctrl->len = reg & MWPnCFG3_Size_mask; +} + +struct task_struct; +struct notifier_block; +struct perf_event; +struct perf_event_attr; + +extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, + int *gen_len, int *gen_type, int *offset); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); +extern int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw); +extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data); + +extern int arch_install_hw_breakpoint(struct perf_event *bp); +extern void arch_uninstall_hw_breakpoint(struct perf_event *bp); +extern int hw_breakpoint_slots(int type); +extern void hw_breakpoint_pmu_read(struct perf_event *bp); + +void breakpoint_handler(struct pt_regs *regs); +void watchpoint_handler(struct pt_regs *regs); + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +extern void ptrace_hw_copy_thread(struct task_struct *task); +extern void hw_breakpoint_thread_switch(struct task_struct *next); +#else +static inline void ptrace_hw_copy_thread(struct task_struct *task) +{ +} +static inline void hw_breakpoint_thread_switch(struct task_struct *next) +{ +} +#endif + +/* Determine number of BRP registers available. */ +static inline int get_num_brps(void) +{ + return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; +} + +/* Determine number of WRP registers available. 
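(Editor's note: as with get_num_brps() above, only the low six bits of the config CSR encode the implemented count, which is what masking with CSR_MWPC_NUM (0x3f) extracts.)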
*/ +static inline int get_num_wrps(void) +{ + return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; +} + +#endif /* __KERNEL__ */ +#endif /* __ASM_BREAKPOINT_H */ diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index 7eedd83fd0d7..a04fe755d719 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -7,6 +7,7 @@ #include <linux/types.h> #include <asm/asm.h> +#include <asm/ptrace.h> #define INSN_NOP 0x03400000 #define INSN_BREAK 0x002a0000 @@ -23,6 +24,10 @@ #define ADDR_IMM(addr, INSN) ((addr & ADDR_IMMMASK_##INSN) >> ADDR_IMMSHIFT_##INSN) +enum reg0i15_op { + break_op = 0x54, +}; + enum reg0i26_op { b_op = 0x14, bl_op = 0x15, @@ -32,6 +37,7 @@ enum reg1i20_op { lu12iw_op = 0x0a, lu32id_op = 0x0b, pcaddi_op = 0x0c, + pcalau12i_op = 0x0d, pcaddu12i_op = 0x0e, pcaddu18i_op = 0x0f, }; @@ -178,6 +184,11 @@ enum reg3sa2_op { alsld_op = 0x16, }; +struct reg0i15_format { + unsigned int immediate : 15; + unsigned int opcode : 17; +}; + struct reg0i26_format { unsigned int immediate_h : 10; unsigned int immediate_l : 16; @@ -263,6 +274,7 @@ struct reg3sa2_format { union loongarch_instruction { unsigned int word; + struct reg0i15_format reg0i15_format; struct reg0i26_format reg0i26_format; struct reg1i20_format reg1i20_format; struct reg1i21_format reg1i21_format; @@ -321,6 +333,11 @@ static inline bool is_imm_negative(unsigned long val, unsigned int bit) return val & (1UL << (bit - 1)); } +static inline bool is_break_ins(union loongarch_instruction *ip) +{ + return ip->reg0i15_format.opcode == break_op; +} + static inline bool is_pc_ins(union loongarch_instruction *ip) { return ip->reg1i20_format.opcode >= pcaddi_op && @@ -351,6 +368,47 @@ static inline bool is_stack_alloc_ins(union loongarch_instruction *ip) is_imm12_negative(ip->reg2i12_format.immediate); } +static inline bool is_self_loop_ins(union loongarch_instruction *ip, struct pt_regs *regs) +{ + switch (ip->reg0i26_format.opcode) { + case b_op: + case bl_op: + if (ip->reg0i26_format.immediate_l == 0 + && ip->reg0i26_format.immediate_h == 0) + return true; + } + + switch (ip->reg1i21_format.opcode) { + case beqz_op: + case bnez_op: + case bceqz_op: + if (ip->reg1i21_format.immediate_l == 0 + && ip->reg1i21_format.immediate_h == 0) + return true; + } + + switch (ip->reg2i16_format.opcode) { + case beq_op: + case bne_op: + case blt_op: + case bge_op: + case bltu_op: + case bgeu_op: + if (ip->reg2i16_format.immediate == 0) + return true; + break; + case jirl_op: + if (regs->regs[ip->reg2i16_format.rj] + + ((unsigned long)ip->reg2i16_format.immediate << 2) == (unsigned long)ip) + return true; + } + + return false; +} + +void simu_pc(struct pt_regs *regs, union loongarch_instruction insn); +void simu_branch(struct pt_regs *regs, union loongarch_instruction insn); + int larch_insn_read(void *addr, u32 *insnp); int larch_insn_write(void *addr, u32 insn); int larch_insn_patch_text(void *addr, u32 insn); diff --git a/arch/loongarch/include/asm/kprobes.h b/arch/loongarch/include/asm/kprobes.h new file mode 100644 index 000000000000..798020ae02c6 --- /dev/null +++ b/arch/loongarch/include/asm/kprobes.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_LOONGARCH_KPROBES_H +#define __ASM_LOONGARCH_KPROBES_H + +#include <asm-generic/kprobes.h> + +#ifdef CONFIG_KPROBES + +#include <asm/inst.h> +#include <asm/cacheflush.h> + +#define __ARCH_WANT_KPROBES_INSN_SLOT +#define MAX_INSN_SIZE 2 + +#define flush_insn_slot(p) \ +do { \ + if (p->addr) \ + 
flush_icache_range((unsigned long)p->addr, \ + (unsigned long)p->addr + \ + (MAX_INSN_SIZE * sizeof(kprobe_opcode_t))); \ +} while (0) + +#define kretprobe_blacklist_size 0 + +typedef union loongarch_instruction kprobe_opcode_t; + +/* Architecture specific copy of original instruction */ +struct arch_specific_insn { + /* copy of the original instruction */ + kprobe_opcode_t *insn; + /* restore address after simulation */ + unsigned long restore; +}; + +struct prev_kprobe { + struct kprobe *kp; + unsigned int status; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned int kprobe_status; + unsigned long saved_status; + struct prev_kprobe prev_kprobe; +}; + +void arch_remove_kprobe(struct kprobe *p); +bool kprobe_fault_handler(struct pt_regs *regs, int trapnr); +bool kprobe_breakpoint_handler(struct pt_regs *regs); +bool kprobe_singlestep_handler(struct pt_regs *regs); + +void __kretprobe_trampoline(void); +void *trampoline_probe_handler(struct pt_regs *regs); + +#else /* !CONFIG_KPROBES */ + +static inline bool kprobe_breakpoint_handler(struct pt_regs *regs) { return false; } +static inline bool kprobe_singlestep_handler(struct pt_regs *regs) { return false; } + +#endif /* CONFIG_KPROBES */ +#endif /* __ASM_LOONGARCH_KPROBES_H */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 7f8d57a61c8b..65b7dcdea16d 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -970,42 +970,42 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define LOONGARCH_CSR_DB0ADDR 0x310 /* data breakpoint 0 address */ #define LOONGARCH_CSR_DB0MASK 0x311 /* data breakpoint 0 mask */ -#define LOONGARCH_CSR_DB0CTL 0x312 /* data breakpoint 0 control */ +#define LOONGARCH_CSR_DB0CTRL 0x312 /* data breakpoint 0 control */ #define LOONGARCH_CSR_DB0ASID 0x313 /* data breakpoint 0 asid */ #define LOONGARCH_CSR_DB1ADDR 0x318 /* data breakpoint 1 address */ #define LOONGARCH_CSR_DB1MASK 0x319 /* data breakpoint 1 mask */ -#define LOONGARCH_CSR_DB1CTL 0x31a /* data breakpoint 1 control */ +#define LOONGARCH_CSR_DB1CTRL 0x31a /* data breakpoint 1 control */ #define LOONGARCH_CSR_DB1ASID 0x31b /* data breakpoint 1 asid */ #define LOONGARCH_CSR_DB2ADDR 0x320 /* data breakpoint 2 address */ #define LOONGARCH_CSR_DB2MASK 0x321 /* data breakpoint 2 mask */ -#define LOONGARCH_CSR_DB2CTL 0x322 /* data breakpoint 2 control */ +#define LOONGARCH_CSR_DB2CTRL 0x322 /* data breakpoint 2 control */ #define LOONGARCH_CSR_DB2ASID 0x323 /* data breakpoint 2 asid */ #define LOONGARCH_CSR_DB3ADDR 0x328 /* data breakpoint 3 address */ #define LOONGARCH_CSR_DB3MASK 0x329 /* data breakpoint 3 mask */ -#define LOONGARCH_CSR_DB3CTL 0x32a /* data breakpoint 3 control */ +#define LOONGARCH_CSR_DB3CTRL 0x32a /* data breakpoint 3 control */ #define LOONGARCH_CSR_DB3ASID 0x32b /* data breakpoint 3 asid */ #define LOONGARCH_CSR_DB4ADDR 0x330 /* data breakpoint 4 address */ #define LOONGARCH_CSR_DB4MASK 0x331 /* data breakpoint 4 maks */ -#define LOONGARCH_CSR_DB4CTL 0x332 /* data breakpoint 4 control */ +#define LOONGARCH_CSR_DB4CTRL 0x332 /* data breakpoint 4 control */ #define LOONGARCH_CSR_DB4ASID 0x333 /* data breakpoint 4 asid */ #define LOONGARCH_CSR_DB5ADDR 0x338 /* data breakpoint 5 address */ #define LOONGARCH_CSR_DB5MASK 0x339 /* data breakpoint 5 mask */ -#define LOONGARCH_CSR_DB5CTL 0x33a /* data breakpoint 5 control */ +#define LOONGARCH_CSR_DB5CTRL 0x33a /* data breakpoint 5 control */ #define LOONGARCH_CSR_DB5ASID 
0x33b /* data breakpoint 5 asid */ #define LOONGARCH_CSR_DB6ADDR 0x340 /* data breakpoint 6 address */ #define LOONGARCH_CSR_DB6MASK 0x341 /* data breakpoint 6 mask */ -#define LOONGARCH_CSR_DB6CTL 0x342 /* data breakpoint 6 control */ +#define LOONGARCH_CSR_DB6CTRL 0x342 /* data breakpoint 6 control */ #define LOONGARCH_CSR_DB6ASID 0x343 /* data breakpoint 6 asid */ #define LOONGARCH_CSR_DB7ADDR 0x348 /* data breakpoint 7 address */ #define LOONGARCH_CSR_DB7MASK 0x349 /* data breakpoint 7 mask */ -#define LOONGARCH_CSR_DB7CTL 0x34a /* data breakpoint 7 control */ +#define LOONGARCH_CSR_DB7CTRL 0x34a /* data breakpoint 7 control */ #define LOONGARCH_CSR_DB7ASID 0x34b /* data breakpoint 7 asid */ #define LOONGARCH_CSR_FWPC 0x380 /* instruction breakpoint config */ @@ -1013,48 +1013,51 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define LOONGARCH_CSR_IB0ADDR 0x390 /* inst breakpoint 0 address */ #define LOONGARCH_CSR_IB0MASK 0x391 /* inst breakpoint 0 mask */ -#define LOONGARCH_CSR_IB0CTL 0x392 /* inst breakpoint 0 control */ +#define LOONGARCH_CSR_IB0CTRL 0x392 /* inst breakpoint 0 control */ #define LOONGARCH_CSR_IB0ASID 0x393 /* inst breakpoint 0 asid */ #define LOONGARCH_CSR_IB1ADDR 0x398 /* inst breakpoint 1 address */ #define LOONGARCH_CSR_IB1MASK 0x399 /* inst breakpoint 1 mask */ -#define LOONGARCH_CSR_IB1CTL 0x39a /* inst breakpoint 1 control */ +#define LOONGARCH_CSR_IB1CTRL 0x39a /* inst breakpoint 1 control */ #define LOONGARCH_CSR_IB1ASID 0x39b /* inst breakpoint 1 asid */ #define LOONGARCH_CSR_IB2ADDR 0x3a0 /* inst breakpoint 2 address */ #define LOONGARCH_CSR_IB2MASK 0x3a1 /* inst breakpoint 2 mask */ -#define LOONGARCH_CSR_IB2CTL 0x3a2 /* inst breakpoint 2 control */ +#define LOONGARCH_CSR_IB2CTRL 0x3a2 /* inst breakpoint 2 control */ #define LOONGARCH_CSR_IB2ASID 0x3a3 /* inst breakpoint 2 asid */ #define LOONGARCH_CSR_IB3ADDR 0x3a8 /* inst breakpoint 3 address */ #define LOONGARCH_CSR_IB3MASK 0x3a9 /* breakpoint 3 mask */ -#define LOONGARCH_CSR_IB3CTL 0x3aa /* inst breakpoint 3 control */ +#define LOONGARCH_CSR_IB3CTRL 0x3aa /* inst breakpoint 3 control */ #define LOONGARCH_CSR_IB3ASID 0x3ab /* inst breakpoint 3 asid */ #define LOONGARCH_CSR_IB4ADDR 0x3b0 /* inst breakpoint 4 address */ #define LOONGARCH_CSR_IB4MASK 0x3b1 /* inst breakpoint 4 mask */ -#define LOONGARCH_CSR_IB4CTL 0x3b2 /* inst breakpoint 4 control */ +#define LOONGARCH_CSR_IB4CTRL 0x3b2 /* inst breakpoint 4 control */ #define LOONGARCH_CSR_IB4ASID 0x3b3 /* inst breakpoint 4 asid */ #define LOONGARCH_CSR_IB5ADDR 0x3b8 /* inst breakpoint 5 address */ #define LOONGARCH_CSR_IB5MASK 0x3b9 /* inst breakpoint 5 mask */ -#define LOONGARCH_CSR_IB5CTL 0x3ba /* inst breakpoint 5 control */ +#define LOONGARCH_CSR_IB5CTRL 0x3ba /* inst breakpoint 5 control */ #define LOONGARCH_CSR_IB5ASID 0x3bb /* inst breakpoint 5 asid */ #define LOONGARCH_CSR_IB6ADDR 0x3c0 /* inst breakpoint 6 address */ #define LOONGARCH_CSR_IB6MASK 0x3c1 /* inst breakpoint 6 mask */ -#define LOONGARCH_CSR_IB6CTL 0x3c2 /* inst breakpoint 6 control */ +#define LOONGARCH_CSR_IB6CTRL 0x3c2 /* inst breakpoint 6 control */ #define LOONGARCH_CSR_IB6ASID 0x3c3 /* inst breakpoint 6 asid */ #define LOONGARCH_CSR_IB7ADDR 0x3c8 /* inst breakpoint 7 address */ #define LOONGARCH_CSR_IB7MASK 0x3c9 /* inst breakpoint 7 mask */ -#define LOONGARCH_CSR_IB7CTL 0x3ca /* inst breakpoint 7 control */ +#define LOONGARCH_CSR_IB7CTRL 0x3ca /* inst breakpoint 7 control */ #define LOONGARCH_CSR_IB7ASID 0x3cb /* inst breakpoint 7 asid */ #define 
LOONGARCH_CSR_DEBUG 0x500 /* debug config */ #define LOONGARCH_CSR_DERA 0x501 /* debug era */ #define LOONGARCH_CSR_DESAVE 0x502 /* debug save */ +#define CSR_FWPC_SKIP_SHIFT 16 +#define CSR_FWPC_SKIP (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT) + /* * CSR_ECFG IM */ diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 7184f1dc61f2..636e1c66398c 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -11,6 +11,7 @@ #include <asm/cpu.h> #include <asm/cpu-info.h> +#include <asm/hw_breakpoint.h> #include <asm/loongarch.h> #include <asm/vdso/processor.h> #include <uapi/asm/ptrace.h> @@ -124,13 +125,18 @@ struct thread_struct { /* Other stuff associated with the thread. */ unsigned long trap_nr; unsigned long error_code; + unsigned long single_step; /* Used by PTRACE_SINGLESTEP */ struct loongarch_vdso_info *vdso; /* - * FPU & vector registers, must be at last because - * they are conditionally copied at fork(). + * FPU & vector registers, must come last in the inherited + * context because they are conditionally copied at fork(). */ struct loongarch_fpu fpu FPU_ALIGN; + + /* Hardware breakpoints pinned to this task. */ + struct perf_event *hbp_break[LOONGARCH_MAX_BRP]; + struct perf_event *hbp_watch[LOONGARCH_MAX_WRP]; }; #define thread_saved_ra(tsk) (tsk->thread.sched_ra) @@ -172,6 +178,8 @@ struct thread_struct { .fcc = 0, \ .fpr = {{{0,},},}, \ }, \ + .hbp_break = {0}, \ + .hbp_watch = {0}, \ } struct task_struct; @@ -184,10 +192,6 @@ extern unsigned long boot_option_idle_override; */ extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); -static inline void flush_thread(void) -{ -} - unsigned long __get_wchan(struct task_struct *p); #define __KSTK_TOS(tsk) ((unsigned long)task_stack_page(tsk) + \ diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index 59c4608de91d..d761db943335 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -6,6 +6,7 @@ #define _ASM_PTRACE_H #include <asm/page.h> +#include <asm/irqflags.h> #include <asm/thread_info.h> #include <uapi/asm/ptrace.h> @@ -109,6 +110,40 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsi struct task_struct; +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns the @n-th argument of the function + * call; e.g. n == 2 yields regs->regs[6] (a2), while n >= 8 falls back + * to the kernel stack. Note that this chooses the most probable + * assignment; in some cases it can be incorrect. + * This is expected to be called from kprobes or ftrace with regs + * where the top of stack is the return address. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ +#define NR_REG_ARGUMENTS 8 + static const unsigned int args[] = { + offsetof(struct pt_regs, regs[4]), + offsetof(struct pt_regs, regs[5]), + offsetof(struct pt_regs, regs[6]), + offsetof(struct pt_regs, regs[7]), + offsetof(struct pt_regs, regs[8]), + offsetof(struct pt_regs, regs[9]), + offsetof(struct pt_regs, regs[10]), + offsetof(struct pt_regs, regs[11]), + }; + + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, args[n]); + else { + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, n); + } +} + /* * Does the process account for user or for system time?
*/ @@ -149,4 +184,8 @@ static inline void user_stack_pointer_set(struct pt_regs *regs, regs->regs[3] = val; } +#ifdef CONFIG_HAVE_HW_BREAKPOINT +#define arch_has_single_step() (1) +#endif + #endif /* _ASM_PTRACE_H */ diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h index 72ead58039f3..be05c0e706a2 100644 --- a/arch/loongarch/include/asm/setup.h +++ b/arch/loongarch/include/asm/setup.h @@ -21,4 +21,20 @@ extern void per_cpu_trap_init(int cpu); extern void set_handler(unsigned long offset, void *addr, unsigned long len); extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len); +#ifdef CONFIG_RELOCATABLE + +struct rela_la_abs { + long offset; + long symvalue; +}; + +extern long __la_abs_begin; +extern long __la_abs_end; +extern long __rela_dyn_begin; +extern long __rela_dyn_end; + +extern void * __init relocate_kernel(void); + +#endif + #endif /* __SETUP_H */ diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 4ca953062b5b..7df80e6ae9d2 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -7,6 +7,7 @@ #include <linux/threads.h> +#include <asm/addrspace.h> #include <asm/asm.h> #include <asm/asmmacro.h> #include <asm/asm-offsets.h> @@ -36,6 +37,14 @@ cfi_restore \reg \offset \docfi .endm +/* Jump to the runtime virtual address. */ + .macro JUMP_VIRT_ADDR temp1 temp2 + li.d \temp1, CACHE_BASE + pcaddi \temp2, 0 + or \temp1, \temp1, \temp2 + jirl zero, \temp1, 0xc + .endm + .macro BACKUP_T0T1 csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 @@ -77,7 +86,7 @@ * new value in sp. */ .macro get_saved_sp docfi=0 - la.abs t1, kernelsp + la_abs t1, kernelsp #ifdef CONFIG_SMP csrrd t0, PERCPU_BASE_KS LONG_ADD t1, t1, t0 @@ -90,7 +99,7 @@ .endm .macro set_saved_sp stackp temp temp2 - la.abs \temp, kernelsp + la.pcrel \temp, kernelsp #ifdef CONFIG_SMP LONG_ADD \temp, \temp, u0 #endif diff --git a/arch/loongarch/include/asm/switch_to.h b/arch/loongarch/include/asm/switch_to.h index 43a5ab162d38..24e3094bebab 100644 --- a/arch/loongarch/include/asm/switch_to.h +++ b/arch/loongarch/include/asm/switch_to.h @@ -34,6 +34,7 @@ extern asmlinkage struct task_struct *__switch_to(struct task_struct *prev, #define switch_to(prev, next, last) \ do { \ lose_fpu_inatomic(1, prev); \ + hw_breakpoint_thread_switch(next); \ (last) = __switch_to(prev, next, task_thread_info(next), \ __builtin_return_address(0), __builtin_frame_address(0)); \ } while (0) diff --git a/arch/loongarch/include/asm/uaccess.h b/arch/loongarch/include/asm/uaccess.h index 255899d4a7c3..0d22991ae430 100644 --- a/arch/loongarch/include/asm/uaccess.h +++ b/arch/loongarch/include/asm/uaccess.h @@ -22,7 +22,6 @@ extern u64 __ua_limit; #define __UA_ADDR ".dword" -#define __UA_LA "la.abs" #define __UA_LIMIT __ua_limit /* diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h index 083193f4a5d5..cc48ed262021 100644 --- a/arch/loongarch/include/uapi/asm/ptrace.h +++ b/arch/loongarch/include/uapi/asm/ptrace.h @@ -46,6 +46,15 @@ struct user_fp_state { uint32_t fcsr; }; +struct user_watch_state { + uint16_t dbg_info; + struct { + uint64_t addr; + uint64_t mask; + uint32_t ctrl; + } dbg_regs[8]; +}; + #define PTRACE_SYSEMU 0x1f #define PTRACE_SYSEMU_SINGLESTEP 0x20 diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index c8cfbd562921..78d4e3384305 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -8,13 
+8,15 @@ extra-y := vmlinux.lds obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \ traps.o irq.o idle.o process.o dma.o mem.o io.o reset.o switch.o \ elf.o syscall.o signal.o time.o topology.o inst.o ptrace.o vdso.o \ - alternative.o unaligned.o unwind.o + alternative.o unwind.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_CPU_HAS_FPU) += fpu.o +obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o + ifdef CONFIG_FUNCTION_TRACER ifndef CONFIG_DYNAMIC_FTRACE obj-y += mcount.o ftrace.o @@ -39,6 +41,8 @@ obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o +obj-$(CONFIG_RELOCATABLE) += relocate.o + obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o @@ -46,5 +50,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_regs.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o + +obj-$(CONFIG_KPROBES) += kprobes.o kprobes_trampoline.o CPPFLAGS_vmlinux.lds := $(KBUILD_CFLAGS) diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index d53b631c9022..d737e3cf42d3 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -19,70 +19,71 @@ .cfi_sections .debug_frame .align 5 SYM_FUNC_START(handle_syscall) - csrrd t0, PERCPU_BASE_KS - la.abs t1, kernelsp - add.d t1, t1, t0 - move t2, sp - ld.d sp, t1, 0 + csrrd t0, PERCPU_BASE_KS + la.pcrel t1, kernelsp + add.d t1, t1, t0 + move t2, sp + ld.d sp, t1, 0 - addi.d sp, sp, -PT_SIZE - cfi_st t2, PT_R3 + addi.d sp, sp, -PT_SIZE + cfi_st t2, PT_R3 cfi_rel_offset sp, PT_R3 - st.d zero, sp, PT_R0 - csrrd t2, LOONGARCH_CSR_PRMD - st.d t2, sp, PT_PRMD - csrrd t2, LOONGARCH_CSR_CRMD - st.d t2, sp, PT_CRMD - csrrd t2, LOONGARCH_CSR_EUEN - st.d t2, sp, PT_EUEN - csrrd t2, LOONGARCH_CSR_ECFG - st.d t2, sp, PT_ECFG - csrrd t2, LOONGARCH_CSR_ESTAT - st.d t2, sp, PT_ESTAT - cfi_st ra, PT_R1 - cfi_st a0, PT_R4 - cfi_st a1, PT_R5 - cfi_st a2, PT_R6 - cfi_st a3, PT_R7 - cfi_st a4, PT_R8 - cfi_st a5, PT_R9 - cfi_st a6, PT_R10 - cfi_st a7, PT_R11 - csrrd ra, LOONGARCH_CSR_ERA - st.d ra, sp, PT_ERA + st.d zero, sp, PT_R0 + csrrd t2, LOONGARCH_CSR_PRMD + st.d t2, sp, PT_PRMD + csrrd t2, LOONGARCH_CSR_CRMD + st.d t2, sp, PT_CRMD + csrrd t2, LOONGARCH_CSR_EUEN + st.d t2, sp, PT_EUEN + csrrd t2, LOONGARCH_CSR_ECFG + st.d t2, sp, PT_ECFG + csrrd t2, LOONGARCH_CSR_ESTAT + st.d t2, sp, PT_ESTAT + cfi_st ra, PT_R1 + cfi_st a0, PT_R4 + cfi_st a1, PT_R5 + cfi_st a2, PT_R6 + cfi_st a3, PT_R7 + cfi_st a4, PT_R8 + cfi_st a5, PT_R9 + cfi_st a6, PT_R10 + cfi_st a7, PT_R11 + csrrd ra, LOONGARCH_CSR_ERA + st.d ra, sp, PT_ERA cfi_rel_offset ra, PT_ERA - cfi_st tp, PT_R2 - cfi_st u0, PT_R21 - cfi_st fp, PT_R22 + cfi_st tp, PT_R2 + cfi_st u0, PT_R21 + cfi_st fp, PT_R22 SAVE_STATIC - move u0, t0 - li.d tp, ~_THREAD_MASK - and tp, tp, sp + move u0, t0 + li.d tp, ~_THREAD_MASK + and tp, tp, sp - move a0, sp - bl do_syscall + move a0, sp + bl do_syscall RESTORE_ALL_AND_RET SYM_FUNC_END(handle_syscall) +_ASM_NOKPROBE(handle_syscall) SYM_CODE_START(ret_from_fork) - bl schedule_tail # a0 = struct task_struct *prev - move a0, sp - bl syscall_exit_to_user_mode + bl schedule_tail # a0 = struct task_struct *prev + move a0, sp + bl syscall_exit_to_user_mode RESTORE_STATIC RESTORE_SOME RESTORE_SP_AND_RET SYM_CODE_END(ret_from_fork) SYM_CODE_START(ret_from_kernel_thread) - bl schedule_tail # a0 = struct task_struct *prev - move a0, s1 - jirl ra, s0, 0 - move a0, sp - bl 
syscall_exit_to_user_mode + bl schedule_tail # a0 = struct task_struct *prev + move a0, s1 + jirl ra, s0, 0 + move a0, sp + bl syscall_exit_to_user_mode RESTORE_STATIC RESTORE_SOME RESTORE_SP_AND_RET diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index 0f07591cab30..4a3ef8516ccc 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -6,6 +6,7 @@ */ #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/uaccess.h> #include <asm/inst.h> @@ -271,3 +272,66 @@ int ftrace_disable_ftrace_graph_caller(void) } #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_KPROBES_ON_FTRACE +/* Ftrace callback handler for kprobes -- called under preempt disabled */ +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + int bit; + struct pt_regs *regs; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto out; + + regs = ftrace_get_regs(fregs); + if (!regs) + goto out; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + unsigned long orig_ip = instruction_pointer(regs); + + instruction_pointer_set(regs, ip); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) { + /* + * Emulate singlestep (and also recover regs->csr_era) + * as if there is a nop + */ + instruction_pointer_set(regs, (unsigned long)p->addr + MCOUNT_INSN_SIZE); + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + instruction_pointer_set(regs, orig_ip); + } + + /* + * If pre_handler returns !0, it changes regs->csr_era. We have to + * skip emulating post_handler.
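 + * For example, a pre_handler used for function-replacement style
 + * error injection may redirect regs->csr_era to a replacement
 + * function; running post_handler against the original address
 + * would then be wrong.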
+ */ + __this_cpu_write(current_kprobe, NULL); + } +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + return 0; +} +#endif /* CONFIG_KPROBES_ON_FTRACE */ diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 7e5c293ed89f..44ff1ff64260 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -34,7 +34,7 @@ SYM_FUNC_END(__arch_cpu_idle) SYM_FUNC_START(handle_vint) BACKUP_T0T1 SAVE_ALL - la.abs t1, __arch_cpu_idle + la_abs t1, __arch_cpu_idle LONG_L t0, sp, PT_ERA /* 32 byte rollback region */ ori t0, t0, 0x1f @@ -43,7 +43,7 @@ SYM_FUNC_START(handle_vint) LONG_S t0, sp, PT_ERA 1: move a0, sp move a1, sp - la.abs t0, do_vint + la_abs t0, do_vint jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_vint) @@ -72,7 +72,7 @@ SYM_FUNC_END(except_vec_cex) SAVE_ALL build_prep_\prep move a0, sp - la.abs t0, do_\handler + la_abs t0, do_\handler jirl ra, t0, 0 668: RESTORE_ALL_AND_RET @@ -93,6 +93,6 @@ SYM_FUNC_END(except_vec_cex) BUILD_HANDLER reserved reserved none /* others */ SYM_FUNC_START(handle_sys) - la.abs t0, handle_syscall + la_abs t0, handle_syscall jr t0 SYM_FUNC_END(handle_sys) diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 57bada6b4e93..aa64b179744f 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -24,7 +24,7 @@ _head: .org 0x8 .dword kernel_entry /* Kernel entry point */ .dword _end - _text /* Kernel image effective size */ - .quad 0 /* Kernel image load offset from start of RAM */ + .quad PHYS_LINK_KADDR /* Kernel image load offset from start of RAM */ .org 0x38 /* 0x20 ~ 0x37 reserved */ .long LINUX_PE_MAGIC .long pe_header - _head /* Offset to the PE header */ @@ -50,11 +50,8 @@ SYM_CODE_START(kernel_entry) # kernel entry point li.d t0, CSR_DMW1_INIT # CA, PLV0, 0x9000 xxxx xxxx xxxx csrwr t0, LOONGARCH_CSR_DMWIN1 - /* We might not get launched at the address the kernel is linked to, - so we jump there. 
*/ - la.abs t0, 0f - jr t0 -0: + JUMP_VIRT_ADDR t0, t1 + /* Enable PG */ li.w t0, 0xb0 # PLV=0, IE=0, PG=1 csrwr t0, LOONGARCH_CSR_CRMD @@ -89,6 +86,23 @@ SYM_CODE_START(kernel_entry) # kernel entry point PTR_ADD sp, sp, tp set_saved_sp sp, t0, t1 +#ifdef CONFIG_RELOCATABLE + + bl relocate_kernel + +#ifdef CONFIG_RANDOMIZE_BASE + /* Repoint the sp into the new kernel */ + PTR_LI sp, (_THREAD_SIZE - PT_SIZE) + PTR_ADD sp, sp, tp + set_saved_sp sp, t0, t1 +#endif + + /* relocate_kernel() returns the new kernel entry point */ + jr a0 + ASM_BUG() + +#endif + bl start_kernel ASM_BUG() @@ -106,9 +120,8 @@ SYM_CODE_START(smpboot_entry) li.d t0, CSR_DMW1_INIT # CA, PLV0 csrwr t0, LOONGARCH_CSR_DMWIN1 - la.abs t0, 0f - jr t0 -0: + JUMP_VIRT_ADDR t0, t1 + /* Enable PG */ li.w t0, 0xb0 # PLV=0, IE=0, PG=1 csrwr t0, LOONGARCH_CSR_CRMD @@ -117,7 +130,7 @@ SYM_CODE_START(smpboot_entry) li.w t0, 0x00 # FPE=0, SXE=0, ASXE=0, BTE=0 csrwr t0, LOONGARCH_CSR_EUEN - la.abs t0, cpuboot_data + la.pcrel t0, cpuboot_data ld.d sp, t0, CPU_BOOT_STACK ld.d tp, t0, CPU_BOOT_TINFO diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..2406c95b34cc --- /dev/null +++ b/arch/loongarch/kernel/hw_breakpoint.c @@ -0,0 +1,548 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2023 Loongson Technology Corporation Limited + */ +#define pr_fmt(fmt) "hw-breakpoint: " fmt + +#include <linux/hw_breakpoint.h> +#include <linux/kprobes.h> +#include <linux/perf_event.h> + +#include <asm/hw_breakpoint.h> + +/* Breakpoint currently in use for each BRP. */ +static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[LOONGARCH_MAX_BRP]); + +/* Watchpoint currently in use for each WRP. */ +static DEFINE_PER_CPU(struct perf_event *, wp_on_reg[LOONGARCH_MAX_WRP]); + +int hw_breakpoint_slots(int type) +{ + /* + * We can be called early, so don't rely on + * our static variables being initialised. 
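 + * Instead, read the counts straight from the hardware config
 + * registers via get_num_brps()/get_num_wrps() rather than from
 + * the cached boot_cpu_data values.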
+ */ + switch (type) { + case TYPE_INST: + return get_num_brps(); + case TYPE_DATA: + return get_num_wrps(); + default: + pr_warn("unknown slot type: %d\n", type); + return 0; + } +} + +#define READ_WB_REG_CASE(OFF, N, REG, T, VAL) \ + case (OFF + N): \ + LOONGARCH_CSR_WATCH_READ(N, REG, T, VAL); \ + break + +#define WRITE_WB_REG_CASE(OFF, N, REG, T, VAL) \ + case (OFF + N): \ + LOONGARCH_CSR_WATCH_WRITE(N, REG, T, VAL); \ + break + +#define GEN_READ_WB_REG_CASES(OFF, REG, T, VAL) \ + READ_WB_REG_CASE(OFF, 0, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 1, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 2, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 3, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 4, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 5, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 6, REG, T, VAL); \ + READ_WB_REG_CASE(OFF, 7, REG, T, VAL); + +#define GEN_WRITE_WB_REG_CASES(OFF, REG, T, VAL) \ + WRITE_WB_REG_CASE(OFF, 0, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 1, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 2, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 3, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 4, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 5, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 6, REG, T, VAL); \ + WRITE_WB_REG_CASE(OFF, 7, REG, T, VAL); + +static u64 read_wb_reg(int reg, int n, int t) +{ + u64 val = 0; + + switch (reg + n) { + GEN_READ_WB_REG_CASES(CSR_CFG_ADDR, ADDR, t, val); + GEN_READ_WB_REG_CASES(CSR_CFG_MASK, MASK, t, val); + GEN_READ_WB_REG_CASES(CSR_CFG_CTRL, CTRL, t, val); + GEN_READ_WB_REG_CASES(CSR_CFG_ASID, ASID, t, val); + default: + pr_warn("Attempt to read from unknown breakpoint register %d\n", n); + } + + return val; +} +NOKPROBE_SYMBOL(read_wb_reg); + +static void write_wb_reg(int reg, int n, int t, u64 val) +{ + switch (reg + n) { + GEN_WRITE_WB_REG_CASES(CSR_CFG_ADDR, ADDR, t, val); + GEN_WRITE_WB_REG_CASES(CSR_CFG_MASK, MASK, t, val); + GEN_WRITE_WB_REG_CASES(CSR_CFG_CTRL, CTRL, t, val); + GEN_WRITE_WB_REG_CASES(CSR_CFG_ASID, ASID, t, val); + default: + pr_warn("Attempt to write to unknown breakpoint register %d\n", n); + } +} +NOKPROBE_SYMBOL(write_wb_reg); + +enum hw_breakpoint_ops { + HW_BREAKPOINT_INSTALL, + HW_BREAKPOINT_UNINSTALL, +}; + +/* + * hw_breakpoint_slot_setup - Find and setup a perf slot according to operations + * + * @slots: pointer to array of slots + * @max_slots: max number of slots + * @bp: perf_event to setup + * @ops: operation to be carried out on the slot + * + * Return: + * slot index on success + * -ENOSPC if no slot is available/matches + * -EINVAL on wrong operations parameter + */ + +static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots, + struct perf_event *bp, enum hw_breakpoint_ops ops) +{ + int i; + struct perf_event **slot; + + for (i = 0; i < max_slots; ++i) { + slot = &slots[i]; + switch (ops) { + case HW_BREAKPOINT_INSTALL: + if (!*slot) { + *slot = bp; + return i; + } + break; + case HW_BREAKPOINT_UNINSTALL: + if (*slot == bp) { + *slot = NULL; + return i; + } + break; + default: + pr_warn_once("Unhandled hw breakpoint ops %d\n", ops); + return -EINVAL; + } + } + + return -ENOSPC; +} + +void ptrace_hw_copy_thread(struct task_struct *tsk) +{ + memset(tsk->thread.hbp_break, 0, sizeof(tsk->thread.hbp_break)); + memset(tsk->thread.hbp_watch, 0, sizeof(tsk->thread.hbp_watch)); +} + +/* + * Unregister breakpoints from this task and reset the pointers in the thread_struct. 
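 + * This is called from flush_thread() across exec(), so a newly
 + * exec'ed program never inherits its predecessor's debug state.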
+ */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + if (t->hbp_break[i]) { + unregister_hw_breakpoint(t->hbp_break[i]); + t->hbp_break[i] = NULL; + } + } + + for (i = 0; i < LOONGARCH_MAX_WRP; i++) { + if (t->hbp_watch[i]) { + unregister_hw_breakpoint(t->hbp_watch[i]); + t->hbp_watch[i] = NULL; + } + } +} + +static int hw_breakpoint_control(struct perf_event *bp, + enum hw_breakpoint_ops ops) +{ + u32 ctrl; + int i, max_slots, enable; + struct perf_event **slots; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + /* Breakpoint */ + slots = this_cpu_ptr(bp_on_reg); + max_slots = boot_cpu_data.watch_ireg_count; + } else { + /* Watchpoint */ + slots = this_cpu_ptr(wp_on_reg); + max_slots = boot_cpu_data.watch_dreg_count; + } + + i = hw_breakpoint_slot_setup(slots, max_slots, bp, ops); + + if (WARN_ONCE(i < 0, "Can't find any breakpoint slot")) + return i; + + switch (ops) { + case HW_BREAKPOINT_INSTALL: + /* Set the FWPnCFG/MWPnCFG 1~4 register. */ + write_wb_reg(CSR_CFG_ADDR, i, 0, info->address); + write_wb_reg(CSR_CFG_ADDR, i, 1, info->address); + write_wb_reg(CSR_CFG_MASK, i, 0, info->mask); + write_wb_reg(CSR_CFG_MASK, i, 1, info->mask); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) { + write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE); + } else { + ctrl = encode_ctrl_reg(info->ctrl); + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE | + 1 << MWPnCFG3_LoadEn | 1 << MWPnCFG3_StoreEn); + } + enable = csr_read64(LOONGARCH_CSR_CRMD); + csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD); + break; + case HW_BREAKPOINT_UNINSTALL: + /* Reset the FWPnCFG/MWPnCFG 1~4 register. */ + write_wb_reg(CSR_CFG_ADDR, i, 0, 0); + write_wb_reg(CSR_CFG_ADDR, i, 1, 0); + write_wb_reg(CSR_CFG_MASK, i, 0, 0); + write_wb_reg(CSR_CFG_MASK, i, 1, 0); + write_wb_reg(CSR_CFG_CTRL, i, 0, 0); + write_wb_reg(CSR_CFG_CTRL, i, 1, 0); + write_wb_reg(CSR_CFG_ASID, i, 0, 0); + write_wb_reg(CSR_CFG_ASID, i, 1, 0); + break; + } + + return 0; +} + +/* + * Install a perf counter breakpoint. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + return hw_breakpoint_control(bp, HW_BREAKPOINT_INSTALL); +} + +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + hw_breakpoint_control(bp, HW_BREAKPOINT_UNINSTALL); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case LOONGARCH_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case LOONGARCH_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case LOONGARCH_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; + case LOONGARCH_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; + } + + return len_in_bytes; +} + +/* + * Check whether bp virtual address is in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) +{ + unsigned int len; + unsigned long va; + + va = hw->address; + len = get_hbp_len(hw->ctrl.len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Extract generic type and length encodings from an arch_hw_breakpoint_ctrl. + * Hopefully this will disappear when ptrace can bypass the conversion + * to generic breakpoint descriptions. 
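 + * For example, a ctrl of { type = LOONGARCH_BREAKPOINT_STORE,
 + * len = LOONGARCH_BREAKPOINT_LEN_4 } maps to the generic
 + * HW_BREAKPOINT_W / HW_BREAKPOINT_LEN_4 pair.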
+ */ +int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, + int *gen_len, int *gen_type, int *offset) +{ + /* Type */ + switch (ctrl.type) { + case LOONGARCH_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; + break; + case LOONGARCH_BREAKPOINT_LOAD: + *gen_type = HW_BREAKPOINT_R; + break; + case LOONGARCH_BREAKPOINT_STORE: + *gen_type = HW_BREAKPOINT_W; + break; + case LOONGARCH_BREAKPOINT_LOAD | LOONGARCH_BREAKPOINT_STORE: + *gen_type = HW_BREAKPOINT_RW; + break; + default: + return -EINVAL; + } + + if (!ctrl.len) + return -EINVAL; + + *offset = __ffs(ctrl.len); + + /* Len */ + switch (ctrl.len) { + case LOONGARCH_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case LOONGARCH_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case LOONGARCH_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; + case LOONGARCH_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Construct an arch_hw_breakpoint from a perf_event. + */ +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + /* Type */ + switch (attr->bp_type) { + case HW_BREAKPOINT_X: + hw->ctrl.type = LOONGARCH_BREAKPOINT_EXECUTE; + break; + case HW_BREAKPOINT_R: + hw->ctrl.type = LOONGARCH_BREAKPOINT_LOAD; + break; + case HW_BREAKPOINT_W: + hw->ctrl.type = LOONGARCH_BREAKPOINT_STORE; + break; + case HW_BREAKPOINT_RW: + hw->ctrl.type = LOONGARCH_BREAKPOINT_LOAD | LOONGARCH_BREAKPOINT_STORE; + break; + default: + return -EINVAL; + } + + /* Len */ + switch (attr->bp_len) { + case HW_BREAKPOINT_LEN_1: + hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_4; + break; + case HW_BREAKPOINT_LEN_8: + hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_8; + break; + default: + return -EINVAL; + } + + /* Address */ + hw->address = attr->bp_addr; + + return 0; +} + +/* + * Validate the arch-specific HW Breakpoint register settings. + */ +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + int ret; + u64 alignment_mask, offset; + + /* Build the arch_hw_breakpoint. 
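 + * The address is then re-aligned and the length mask shifted so
 + * the hardware sees a naturally aligned watch range.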
*/ + ret = arch_build_bp_info(bp, attr, hw); + if (ret) + return ret; + + if (hw->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) + alignment_mask = 0x3; + else + alignment_mask = 0x7; + offset = hw->address & alignment_mask; + + hw->address &= ~alignment_mask; + hw->ctrl.len <<= offset; + + return 0; +} + +static void update_bp_registers(struct pt_regs *regs, int enable, int type) +{ + u32 ctrl; + int i, max_slots; + struct perf_event **slots; + struct arch_hw_breakpoint *info; + + switch (type) { + case 0: + slots = this_cpu_ptr(bp_on_reg); + max_slots = boot_cpu_data.watch_ireg_count; + break; + case 1: + slots = this_cpu_ptr(wp_on_reg); + max_slots = boot_cpu_data.watch_dreg_count; + break; + default: + return; + } + + for (i = 0; i < max_slots; ++i) { + if (!slots[i]) + continue; + + info = counter_arch_bp(slots[i]); + if (enable) { + if ((info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) && (type == 0)) { + write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE); + } else { + ctrl = read_wb_reg(CSR_CFG_CTRL, i, 1); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_LOAD) + ctrl |= 0x1 << MWPnCFG3_LoadEn; + if (info->ctrl.type == LOONGARCH_BREAKPOINT_STORE) + ctrl |= 0x1 << MWPnCFG3_StoreEn; + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl); + } + regs->csr_prmd |= CSR_PRMD_PWE; + } else { + if ((info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) && (type == 0)) { + write_wb_reg(CSR_CFG_CTRL, i, 0, 0); + } else { + ctrl = read_wb_reg(CSR_CFG_CTRL, i, 1); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_LOAD) + ctrl &= ~(0x1 << MWPnCFG3_LoadEn); + if (info->ctrl.type == LOONGARCH_BREAKPOINT_STORE) + ctrl &= ~(0x1 << MWPnCFG3_StoreEn); + write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl); + } + regs->csr_prmd &= ~CSR_PRMD_PWE; + } + } +} +NOKPROBE_SYMBOL(update_bp_registers); + +/* + * Debug exception handlers.
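 + * Both handlers report the hit to perf via perf_bp_event() and
 + * then call update_bp_registers() with enable == 0 so the
 + * exception does not immediately re-trigger on return.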
+ */ +void breakpoint_handler(struct pt_regs *regs) +{ + int i; + struct perf_event *bp, **slots; + + slots = this_cpu_ptr(bp_on_reg); + + for (i = 0; i < boot_cpu_data.watch_ireg_count; ++i) { + bp = slots[i]; + if (bp == NULL) + continue; + perf_bp_event(bp, regs); + } + update_bp_registers(regs, 0, 0); +} +NOKPROBE_SYMBOL(breakpoint_handler); + +void watchpoint_handler(struct pt_regs *regs) +{ + int i; + struct perf_event *wp, **slots; + + slots = this_cpu_ptr(wp_on_reg); + + for (i = 0; i < boot_cpu_data.watch_dreg_count; ++i) { + wp = slots[i]; + if (wp == NULL) + continue; + perf_bp_event(wp, regs); + } + update_bp_registers(regs, 0, 1); +} +NOKPROBE_SYMBOL(watchpoint_handler); + +static int __init arch_hw_breakpoint_init(void) +{ + int cpu; + + boot_cpu_data.watch_ireg_count = get_num_brps(); + boot_cpu_data.watch_dreg_count = get_num_wrps(); + + pr_info("Found %d breakpoint and %d watchpoint registers.\n", + boot_cpu_data.watch_ireg_count, boot_cpu_data.watch_dreg_count); + + for (cpu = 1; cpu < NR_CPUS; cpu++) { + cpu_data[cpu].watch_ireg_count = boot_cpu_data.watch_ireg_count; + cpu_data[cpu].watch_dreg_count = boot_cpu_data.watch_dreg_count; + } + + return 0; +} +arch_initcall(arch_hw_breakpoint_init); + +void hw_breakpoint_thread_switch(struct task_struct *next) +{ + u64 addr, mask; + struct pt_regs *regs = task_pt_regs(next); + + if (test_tsk_thread_flag(next, TIF_SINGLESTEP)) { + addr = read_wb_reg(CSR_CFG_ADDR, 0, 0); + mask = read_wb_reg(CSR_CFG_MASK, 0, 0); + if (!((regs->csr_era ^ addr) & ~mask)) + csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS); + regs->csr_prmd |= CSR_PRMD_PWE; + } else { + /* Update breakpoints */ + update_bp_registers(regs, 1, 0); + /* Update watchpoints */ + update_bp_registers(regs, 1, 1); + } +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ +} + +/* + * Dummy function to register with die_notifier. 
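 + * The real work happens in breakpoint_handler() and
 + * watchpoint_handler() above; the generic layer only requires
 + * that this hook exists.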
+ */ +int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} diff --git a/arch/loongarch/kernel/inst.c b/arch/loongarch/kernel/inst.c index badc59087042..258ef267cd30 100644 --- a/arch/loongarch/kernel/inst.c +++ b/arch/loongarch/kernel/inst.c @@ -10,6 +10,129 @@ static DEFINE_RAW_SPINLOCK(patch_lock); +void simu_pc(struct pt_regs *regs, union loongarch_instruction insn) +{ + unsigned long pc = regs->csr_era; + unsigned int rd = insn.reg1i20_format.rd; + unsigned int imm = insn.reg1i20_format.immediate; + + if (pc & 3) { + pr_warn("%s: invalid pc 0x%lx\n", __func__, pc); + return; + } + + switch (insn.reg1i20_format.opcode) { + case pcaddi_op: + regs->regs[rd] = pc + sign_extend64(imm << 2, 21); + break; + case pcaddu12i_op: + regs->regs[rd] = pc + sign_extend64(imm << 12, 31); + break; + case pcaddu18i_op: + regs->regs[rd] = pc + sign_extend64(imm << 18, 37); + break; + case pcalau12i_op: + regs->regs[rd] = pc + sign_extend64(imm << 12, 31); + regs->regs[rd] &= ~((1 << 12) - 1); + break; + default: + pr_info("%s: unknown opcode\n", __func__); + return; + } + + regs->csr_era += LOONGARCH_INSN_SIZE; +} + +void simu_branch(struct pt_regs *regs, union loongarch_instruction insn) +{ + unsigned int imm, imm_l, imm_h, rd, rj; + unsigned long pc = regs->csr_era; + + if (pc & 3) { + pr_warn("%s: invalid pc 0x%lx\n", __func__, pc); + return; + } + + imm_l = insn.reg0i26_format.immediate_l; + imm_h = insn.reg0i26_format.immediate_h; + switch (insn.reg0i26_format.opcode) { + case b_op: + regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 27); + return; + case bl_op: + regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 27); + regs->regs[1] = pc + LOONGARCH_INSN_SIZE; + return; + } + + imm_l = insn.reg1i21_format.immediate_l; + imm_h = insn.reg1i21_format.immediate_h; + rj = insn.reg1i21_format.rj; + switch (insn.reg1i21_format.opcode) { + case beqz_op: + if (regs->regs[rj] == 0) + regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 22); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + return; + case bnez_op: + if (regs->regs[rj] != 0) + regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 22); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + return; + } + + imm = insn.reg2i16_format.immediate; + rj = insn.reg2i16_format.rj; + rd = insn.reg2i16_format.rd; + switch (insn.reg2i16_format.opcode) { + case beq_op: + if (regs->regs[rj] == regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case bne_op: + if (regs->regs[rj] != regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case blt_op: + if ((long)regs->regs[rj] < (long)regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case bge_op: + if ((long)regs->regs[rj] >= (long)regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case bltu_op: + if (regs->regs[rj] < regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case bgeu_op: + if (regs->regs[rj] >= regs->regs[rd]) + regs->csr_era = pc + sign_extend64(imm << 2, 17); + else + regs->csr_era = pc + LOONGARCH_INSN_SIZE; + break; + case jirl_op: + regs->csr_era = regs->regs[rj] + sign_extend64(imm << 2, 17); 
+ regs->regs[rd] = pc + LOONGARCH_INSN_SIZE; + break; + default: + pr_info("%s: unknown opcode\n", __func__); + return; + } +} + int larch_insn_read(void *addr, u32 *insnp) { int ret; diff --git a/arch/loongarch/kernel/kprobes.c b/arch/loongarch/kernel/kprobes.c new file mode 100644 index 000000000000..56c8c4b09a42 --- /dev/null +++ b/arch/loongarch/kernel/kprobes.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kdebug.h> +#include <linux/kprobes.h> +#include <linux/preempt.h> +#include <asm/break.h> + +static const union loongarch_instruction breakpoint_insn = { + .reg0i15_format = { + .opcode = break_op, + .immediate = BRK_KPROBE_BP, + } +}; + +static const union loongarch_instruction singlestep_insn = { + .reg0i15_format = { + .opcode = break_op, + .immediate = BRK_KPROBE_SSTEPBP, + } +}; + +DEFINE_PER_CPU(struct kprobe *, current_kprobe); +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +static bool insns_not_supported(union loongarch_instruction insn) +{ + switch (insn.reg2i14_format.opcode) { + case llw_op: + case lld_op: + case scw_op: + case scd_op: + pr_notice("kprobe: ll and sc instructions are not supported\n"); + return true; + } + + switch (insn.reg1i21_format.opcode) { + case bceqz_op: + pr_notice("kprobe: bceqz and bcnez instructions are not supported\n"); + return true; + } + + return false; +} +NOKPROBE_SYMBOL(insns_not_supported); + +static bool insns_need_simulation(struct kprobe *p) +{ + if (is_pc_ins(&p->opcode)) + return true; + + if (is_branch_ins(&p->opcode)) + return true; + + return false; +} +NOKPROBE_SYMBOL(insns_need_simulation); + +static void arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) +{ + if (is_pc_ins(&p->opcode)) + simu_pc(regs, p->opcode); + else if (is_branch_ins(&p->opcode)) + simu_branch(regs, p->opcode); +} +NOKPROBE_SYMBOL(arch_simulate_insn); + +static void arch_prepare_ss_slot(struct kprobe *p) +{ + p->ainsn.insn[0] = *p->addr; + p->ainsn.insn[1] = singlestep_insn; + p->ainsn.restore = (unsigned long)p->addr + LOONGARCH_INSN_SIZE; +} +NOKPROBE_SYMBOL(arch_prepare_ss_slot); + +static void arch_prepare_simulate(struct kprobe *p) +{ + p->ainsn.restore = 0; +} +NOKPROBE_SYMBOL(arch_prepare_simulate); + +int arch_prepare_kprobe(struct kprobe *p) +{ + if ((unsigned long)p->addr & 0x3) + return -EILSEQ; + + /* copy instruction */ + p->opcode = *p->addr; + + /* decode instruction */ + if (insns_not_supported(p->opcode)) + return -EINVAL; + + if (insns_need_simulation(p)) { + p->ainsn.insn = NULL; + } else { + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + } + + /* prepare the instruction */ + if (p->ainsn.insn) + arch_prepare_ss_slot(p); + else + arch_prepare_simulate(p); + + return 0; +} +NOKPROBE_SYMBOL(arch_prepare_kprobe); + +/* Install breakpoint in text */ +void arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = breakpoint_insn; + flush_insn_slot(p); +} +NOKPROBE_SYMBOL(arch_arm_kprobe); + +/* Remove breakpoint from text */ +void arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_insn_slot(p); +} +NOKPROBE_SYMBOL(arch_disarm_kprobe); + +void arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } +} +NOKPROBE_SYMBOL(arch_remove_kprobe); + +static void save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; +} +NOKPROBE_SYMBOL(save_previous_kprobe); + +static void restore_previous_kprobe(struct 
kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; +} +NOKPROBE_SYMBOL(restore_previous_kprobe); + +static void set_current_kprobe(struct kprobe *p) +{ + __this_cpu_write(current_kprobe, p); +} +NOKPROBE_SYMBOL(set_current_kprobe); + +/* + * Interrupts need to be disabled before single-step mode is set, + * and not reenabled until after single-step mode ends. + * Without disabling interrupts on the local CPU, an interrupt could + * occur between the exception return and the start of the + * out-of-line single-step, resulting in wrongly single-stepping + * into the interrupt handler. + */ +static void save_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + kcb->saved_status = regs->csr_prmd; + regs->csr_prmd &= ~CSR_PRMD_PIE; +} +NOKPROBE_SYMBOL(save_local_irqflag); + +static void restore_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + regs->csr_prmd = kcb->saved_status; +} +NOKPROBE_SYMBOL(restore_local_irqflag); + +static void post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + /* restore the return addr if it was a non-branching insn */ + if (cur->ainsn.restore != 0) + instruction_pointer_set(regs, cur->ainsn.restore); + + /* restore back original saved kprobe variables and continue */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + preempt_enable_no_resched(); + return; + } + + /* + * update the kcb status even if the cur->post_handler is + * not set because reset_current_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); + + reset_current_kprobe(); + preempt_enable_no_resched(); +} +NOKPROBE_SYMBOL(post_kprobe_handler); + +static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_REENTER; + } else { + kcb->kprobe_status = KPROBE_HIT_SS; + } + + if (p->ainsn.insn) { + /* IRQs and single stepping do not mix well */ + save_local_irqflag(kcb, regs); + /* set ip register to prepare for single stepping */ + regs->csr_era = (unsigned long)p->ainsn.insn; + } else { + /* simulate single stepping */ + arch_simulate_insn(p, regs); + /* now go for post processing */ + post_kprobe_handler(p, kcb, regs); + } +} +NOKPROBE_SYMBOL(setup_singlestep); + +static bool reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_REENTER: + pr_warn("Failed to recover from reentered kprobes.\n"); + dump_kprobe(p); + WARN_ON_ONCE(1); + break; + default: + WARN_ON(1); + return false; + } + + return true; +} +NOKPROBE_SYMBOL(reenter_kprobe); + +bool kprobe_breakpoint_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb; + struct kprobe *p, *cur_kprobe; + kprobe_opcode_t *addr = (kprobe_opcode_t *)regs->csr_era; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing.
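 + * The preempt_disable() below is paired with
 + * preempt_enable_no_resched() on every exit path, including
 + * post_kprobe_handler() once the single-step has completed.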
+ */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + cur_kprobe = kprobe_running(); + + p = get_kprobe(addr); + if (p) { + if (cur_kprobe) { + if (reenter_kprobe(p, regs, kcb)) + return true; + } else { + /* Probe hit */ + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it has + * modified the execution path and there is no need + * for single stepping. Just reset the current kprobe + * and exit. + * + * A pre_handler can itself hit a breakpoint and + * single-step through it before returning. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) { + setup_singlestep(p, regs, kcb, 0); + } else { + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return true; + } + } + + if (addr->word != breakpoint_insn.word) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another CPU has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Return to the original instruction and continue. + */ + regs->csr_era = (unsigned long)addr; + preempt_enable_no_resched(); + return true; + } + + preempt_enable_no_resched(); + return false; +} +NOKPROBE_SYMBOL(kprobe_breakpoint_handler); + +bool kprobe_singlestep_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long addr = instruction_pointer(regs); + + if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && + ((unsigned long)&cur->ainsn.insn[1] == addr)) { + restore_local_irqflag(kcb, regs); + post_kprobe_handler(cur, kcb, regs); + return true; + } + + preempt_enable_no_resched(); + return false; +} +NOKPROBE_SYMBOL(kprobe_singlestep_handler); + +bool kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe, point the ip back to the probe address, + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->csr_era = (unsigned long)cur->addr; + WARN_ON_ONCE(!instruction_pointer(regs)); + + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + } else { + restore_local_irqflag(kcb, regs); + reset_current_kprobe(); + } + preempt_enable_no_resched(); + break; + } + return false; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +/* + * Provide a blacklist of symbols identifying ranges which cannot be kprobed. + * This blacklist is exposed to userspace via debugfs (kprobes/blacklist).
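 + * Only the irqentry text is blacklisted here: probing an exception
 + * entry path could recurse before the kprobe state has been saved.
 + * The effective ranges can be inspected under
 + * /sys/kernel/debug/kprobes/blacklist.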
+ */ +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +/* ASM function that handles the kretprobes must not be probed */ +NOKPROBE_SYMBOL(__kretprobe_trampoline); + +/* Called from __kretprobe_trampoline */ +void __used *trampoline_probe_handler(struct pt_regs *regs) +{ + return (void *)kretprobe_trampoline_handler(regs, NULL); +} +NOKPROBE_SYMBOL(trampoline_probe_handler); + +void arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *)regs->regs[1]; + ri->fp = NULL; + + /* Replace the return addr with trampoline addr */ + regs->regs[1] = (unsigned long)&__kretprobe_trampoline; +} +NOKPROBE_SYMBOL(arch_prepare_kretprobe); + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} +NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/loongarch/kernel/kprobes_trampoline.S b/arch/loongarch/kernel/kprobes_trampoline.S new file mode 100644 index 000000000000..af94b0d213fa --- /dev/null +++ b/arch/loongarch/kernel/kprobes_trampoline.S @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#include <linux/linkage.h> +#include <asm/stackframe.h> + + .text + + .macro save_all_base_regs + cfi_st ra, PT_R1 + cfi_st tp, PT_R2 + cfi_st a0, PT_R4 + cfi_st a1, PT_R5 + cfi_st a2, PT_R6 + cfi_st a3, PT_R7 + cfi_st a4, PT_R8 + cfi_st a5, PT_R9 + cfi_st a6, PT_R10 + cfi_st a7, PT_R11 + cfi_st t0, PT_R12 + cfi_st t1, PT_R13 + cfi_st t2, PT_R14 + cfi_st t3, PT_R15 + cfi_st t4, PT_R16 + cfi_st t5, PT_R17 + cfi_st t6, PT_R18 + cfi_st t7, PT_R19 + cfi_st t8, PT_R20 + cfi_st u0, PT_R21 + cfi_st fp, PT_R22 + cfi_st s0, PT_R23 + cfi_st s1, PT_R24 + cfi_st s2, PT_R25 + cfi_st s3, PT_R26 + cfi_st s4, PT_R27 + cfi_st s5, PT_R28 + cfi_st s6, PT_R29 + cfi_st s7, PT_R30 + cfi_st s8, PT_R31 + csrrd t0, LOONGARCH_CSR_CRMD + andi t0, t0, 0x7 /* extract bit[1:0] PLV, bit[2] IE */ + LONG_S t0, sp, PT_CRMD + .endm + + .macro restore_all_base_regs + cfi_ld tp, PT_R2 + cfi_ld a0, PT_R4 + cfi_ld a1, PT_R5 + cfi_ld a2, PT_R6 + cfi_ld a3, PT_R7 + cfi_ld a4, PT_R8 + cfi_ld a5, PT_R9 + cfi_ld a6, PT_R10 + cfi_ld a7, PT_R11 + cfi_ld t0, PT_R12 + cfi_ld t1, PT_R13 + cfi_ld t2, PT_R14 + cfi_ld t3, PT_R15 + cfi_ld t4, PT_R16 + cfi_ld t5, PT_R17 + cfi_ld t6, PT_R18 + cfi_ld t7, PT_R19 + cfi_ld t8, PT_R20 + cfi_ld u0, PT_R21 + cfi_ld fp, PT_R22 + cfi_ld s0, PT_R23 + cfi_ld s1, PT_R24 + cfi_ld s2, PT_R25 + cfi_ld s3, PT_R26 + cfi_ld s4, PT_R27 + cfi_ld s5, PT_R28 + cfi_ld s6, PT_R29 + cfi_ld s7, PT_R30 + cfi_ld s8, PT_R31 + LONG_L t0, sp, PT_CRMD + li.d t1, 0x7 /* mask bit[1:0] PLV, bit[2] IE */ + csrxchg t0, t1, LOONGARCH_CSR_CRMD + .endm + +SYM_CODE_START(__kretprobe_trampoline) + addi.d sp, sp, -PT_SIZE + save_all_base_regs + + addi.d t0, sp, PT_SIZE + LONG_S t0, sp, PT_R3 + + move a0, sp /* pt_regs */ + + bl trampoline_probe_handler + + /* use the result as the return-address */ + move ra, a0 + + restore_all_base_regs + addi.d sp, sp, PT_SIZE + + jr ra +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index edfd220a3737..fa2443c7afb2 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -18,6 +18,7 @@ #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> +#include <linux/hw_breakpoint.h> #include <linux/mm.h> #include <linux/stddef.h> #include 
<linux/unistd.h> @@ -96,6 +97,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) regs->regs[3] = sp; } +void flush_thread(void) +{ + flush_ptrace_hw_breakpoint(current); +} + void exit_thread(struct task_struct *tsk) { } @@ -181,6 +187,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs->regs[2] = tls; out: + ptrace_hw_copy_thread(p); clear_tsk_thread_flag(p, TIF_USEDFPU); clear_tsk_thread_flag(p, TIF_USEDSIMD); clear_tsk_thread_flag(p, TIF_LSX_CTX_LIVE); diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index dc2b82ea894c..06bceae7d104 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -20,7 +20,9 @@ #include <linux/context_tracking.h> #include <linux/elf.h> #include <linux/errno.h> +#include <linux/hw_breakpoint.h> #include <linux/mm.h> +#include <linux/nospec.h> #include <linux/ptrace.h> #include <linux/regset.h> #include <linux/sched.h> @@ -29,6 +31,7 @@ #include <linux/smp.h> #include <linux/stddef.h> #include <linux/seccomp.h> +#include <linux/thread_info.h> #include <linux/uaccess.h> #include <asm/byteorder.h> @@ -39,6 +42,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/processor.h> +#include <asm/ptrace.h> #include <asm/reg.h> #include <asm/syscall.h> @@ -246,6 +250,384 @@ static int cfg_set(struct task_struct *target, return 0; } +#ifdef CONFIG_HAVE_HW_BREAKPOINT + +/* + * Handle hitting a HW-breakpoint. + */ +static void ptrace_hbptriggered(struct perf_event *bp, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int i; + struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); + + for (i = 0; i < LOONGARCH_MAX_BRP; ++i) + if (current->thread.hbp_break[i] == bp) + break; + + for (i = 0; i < LOONGARCH_MAX_WRP; ++i) + if (current->thread.hbp_watch[i] == bp) + break; + + force_sig_ptrace_errno_trap(i, (void __user *)bkpt->address); +} + +static struct perf_event *ptrace_hbp_get_event(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx) +{ + struct perf_event *bp; + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + if (idx >= LOONGARCH_MAX_BRP) + return ERR_PTR(-EINVAL); + idx = array_index_nospec(idx, LOONGARCH_MAX_BRP); + bp = tsk->thread.hbp_break[idx]; + break; + case NT_LOONGARCH_HW_WATCH: + if (idx >= LOONGARCH_MAX_WRP) + return ERR_PTR(-EINVAL); + idx = array_index_nospec(idx, LOONGARCH_MAX_WRP); + bp = tsk->thread.hbp_watch[idx]; + break; + } + + return bp; +} + +static int ptrace_hbp_set_event(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, + struct perf_event *bp) +{ + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + if (idx >= LOONGARCH_MAX_BRP) + return -EINVAL; + idx = array_index_nospec(idx, LOONGARCH_MAX_BRP); + tsk->thread.hbp_break[idx] = bp; + break; + case NT_LOONGARCH_HW_WATCH: + if (idx >= LOONGARCH_MAX_WRP) + return -EINVAL; + idx = array_index_nospec(idx, LOONGARCH_MAX_WRP); + tsk->thread.hbp_watch[idx] = bp; + break; + } + + return 0; +} + +static struct perf_event *ptrace_hbp_create(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx) +{ + int err, type; + struct perf_event *bp; + struct perf_event_attr attr; + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + type = HW_BREAKPOINT_X; + break; + case NT_LOONGARCH_HW_WATCH: + type = HW_BREAKPOINT_RW; + break; + default: + return ERR_PTR(-EINVAL); + } + + ptrace_breakpoint_init(&attr); + + /* + * Initialise fields to sane defaults + * (i.e. values that will pass validation). 
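 + * The event is created disabled and with a dummy address; the
 + * tracer is expected to fill in real values through the regset
 + * interface before enabling it.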
+ */ + attr.bp_addr = 0; + attr.bp_len = HW_BREAKPOINT_LEN_4; + attr.bp_type = type; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, tsk); + if (IS_ERR(bp)) + return bp; + + err = ptrace_hbp_set_event(note_type, tsk, idx, bp); + if (err) + return ERR_PTR(err); + + return bp; +} + +static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, + struct arch_hw_breakpoint_ctrl ctrl, + struct perf_event_attr *attr) +{ + int err, len, type, offset; + + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); + if (err) + return err; + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + if ((type & HW_BREAKPOINT_X) != type) + return -EINVAL; + break; + case NT_LOONGARCH_HW_WATCH: + if ((type & HW_BREAKPOINT_RW) != type) + return -EINVAL; + break; + default: + return -EINVAL; + } + + attr->bp_len = len; + attr->bp_type = type; + attr->bp_addr += offset; + + return 0; +} + +static int ptrace_hbp_get_resource_info(unsigned int note_type, u16 *info) +{ + u8 num; + u16 reg = 0; + + switch (note_type) { + case NT_LOONGARCH_HW_BREAK: + num = hw_breakpoint_slots(TYPE_INST); + break; + case NT_LOONGARCH_HW_WATCH: + num = hw_breakpoint_slots(TYPE_DATA); + break; + default: + return -EINVAL; + } + + *info = reg | num; + + return 0; +} + +static struct perf_event *ptrace_hbp_get_initialised_bp(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx) +{ + struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); + + if (!bp) + bp = ptrace_hbp_create(note_type, tsk, idx); + + return bp; +} + +static int ptrace_hbp_get_ctrl(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u32 *ctrl) +{ + struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); + + if (IS_ERR(bp)) + return PTR_ERR(bp); + + *ctrl = bp ? encode_ctrl_reg(counter_arch_bp(bp)->ctrl) : 0; + + return 0; +} + +static int ptrace_hbp_get_mask(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u64 *mask) +{ + struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); + + if (IS_ERR(bp)) + return PTR_ERR(bp); + + *mask = bp ? counter_arch_bp(bp)->mask : 0; + + return 0; +} + +static int ptrace_hbp_get_addr(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u64 *addr) +{ + struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); + + if (IS_ERR(bp)) + return PTR_ERR(bp); + + *addr = bp ? 
counter_arch_bp(bp)->address : 0; + + return 0; +} + +static int ptrace_hbp_set_ctrl(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u32 uctrl) +{ + int err; + struct perf_event *bp; + struct perf_event_attr attr; + struct arch_hw_breakpoint_ctrl ctrl; + + bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); + if (IS_ERR(bp)) + return PTR_ERR(bp); + + attr = bp->attr; + decode_ctrl_reg(uctrl, &ctrl); + err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); + if (err) + return err; + + return modify_user_hw_breakpoint(bp, &attr); +} + +static int ptrace_hbp_set_mask(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u64 mask) +{ + struct perf_event *bp; + struct perf_event_attr attr; + struct arch_hw_breakpoint *info; + + bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); + if (IS_ERR(bp)) + return PTR_ERR(bp); + + attr = bp->attr; + info = counter_arch_bp(bp); + info->mask = mask; + + return modify_user_hw_breakpoint(bp, &attr); +} + +static int ptrace_hbp_set_addr(unsigned int note_type, + struct task_struct *tsk, + unsigned long idx, u64 addr) +{ + struct perf_event *bp; + struct perf_event_attr attr; + + bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); + if (IS_ERR(bp)) + return PTR_ERR(bp); + + attr = bp->attr; + attr.bp_addr = addr; + + return modify_user_hw_breakpoint(bp, &attr); +} + +#define PTRACE_HBP_CTRL_SZ sizeof(u32) +#define PTRACE_HBP_ADDR_SZ sizeof(u64) +#define PTRACE_HBP_MASK_SZ sizeof(u64) + +static int hw_break_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + u16 info; + u32 ctrl; + u64 addr, mask; + int ret, idx = 0; + unsigned int note_type = regset->core_note_type; + + /* Resource info */ + ret = ptrace_hbp_get_resource_info(note_type, &info); + if (ret) + return ret; + + membuf_write(&to, &info, sizeof(info)); + + /* (address, mask, ctrl) registers */ + while (to.left) { + ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); + if (ret) + return ret; + + ret = ptrace_hbp_get_mask(note_type, target, idx, &mask); + if (ret) + return ret; + + ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); + if (ret) + return ret; + + membuf_store(&to, addr); + membuf_store(&to, mask); + membuf_store(&to, ctrl); + idx++; + } + + return 0; +} + +static int hw_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u32 ctrl; + u64 addr, mask; + int ret, idx = 0, offset, limit; + unsigned int note_type = regset->core_note_type; + + /* Resource info */ + offset = offsetof(struct user_watch_state, dbg_regs); + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); + + /* (address, mask, ctrl) registers */ + limit = regset->n * regset->size; + while (count && offset < limit) { + if (count < PTRACE_HBP_ADDR_SZ) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &addr, + offset, offset + PTRACE_HBP_ADDR_SZ); + if (ret) + return ret; + + ret = ptrace_hbp_set_addr(note_type, target, idx, addr); + if (ret) + return ret; + offset += PTRACE_HBP_ADDR_SZ; + + if (!count) + break; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask, + offset, offset + PTRACE_HBP_MASK_SZ); + if (ret) + return ret; + + ret = ptrace_hbp_set_mask(note_type, target, idx, mask); + if (ret) + return ret; + offset += PTRACE_HBP_MASK_SZ; + + if (!count) + break; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, + offset, offset + PTRACE_HBP_CTRL_SZ); + if (ret) + return ret; +
+ ret = ptrace_hbp_set_ctrl(note_type, target, idx, ctrl); + if (ret) + return ret; + offset += PTRACE_HBP_CTRL_SZ; + idx++; + } + + return 0; +} + +#endif + struct pt_regs_offset { const char *name; int offset; @@ -319,6 +701,10 @@ enum loongarch_regset { REGSET_GPR, REGSET_FPR, REGSET_CPUCFG, +#ifdef CONFIG_HAVE_HW_BREAKPOINT + REGSET_HW_BREAK, + REGSET_HW_WATCH, +#endif }; static const struct user_regset loongarch64_regsets[] = { @@ -346,6 +732,24 @@ static const struct user_regset loongarch64_regsets[] = { .regset_get = cfg_get, .set = cfg_set, }, +#ifdef CONFIG_HAVE_HW_BREAKPOINT + [REGSET_HW_BREAK] = { + .core_note_type = NT_LOONGARCH_HW_BREAK, + .n = sizeof(struct user_watch_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = hw_break_get, + .set = hw_break_set, + }, + [REGSET_HW_WATCH] = { + .core_note_type = NT_LOONGARCH_HW_WATCH, + .n = sizeof(struct user_watch_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = hw_break_get, + .set = hw_break_set, + }, +#endif }; static const struct user_regset_view user_loongarch64_view = { @@ -431,3 +835,71 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void ptrace_triggered(struct perf_event *bp, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct perf_event_attr attr; + + attr = bp->attr; + attr.disabled = true; + modify_user_hw_breakpoint(bp, &attr); +} + +static int set_single_step(struct task_struct *tsk, unsigned long addr) +{ + struct perf_event *bp; + struct perf_event_attr attr; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = &tsk->thread; + + bp = thread->hbp_break[0]; + if (!bp) { + ptrace_breakpoint_init(&attr); + + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_8; + attr.bp_type = HW_BREAKPOINT_X; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); + if (IS_ERR(bp)) + return PTR_ERR(bp); + + thread->hbp_break[0] = bp; + } else { + int err; + + attr = bp->attr; + attr.bp_addr = addr; + + /* Reenable breakpoint */ + attr.disabled = false; + err = modify_user_hw_breakpoint(bp, &attr); + if (unlikely(err)) + return err; + + csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR); + } + info = counter_arch_bp(bp); + info->mask = TASK_SIZE - 1; + + return 0; +} + +/* ptrace API */ +void user_enable_single_step(struct task_struct *task) +{ + struct thread_info *ti = task_thread_info(task); + + set_single_step(task, task_pt_regs(task)->csr_era); + task->thread.single_step = task_pt_regs(task)->csr_era; + set_ti_thread_flag(ti, TIF_SINGLESTEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_SINGLESTEP); +} +#endif diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c new file mode 100644 index 000000000000..01f94d1e3edf --- /dev/null +++ b/arch/loongarch/kernel/relocate.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for Kernel relocation at boot time + * + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#include <linux/elf.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/panic_notifier.h> +#include <linux/start_kernel.h> +#include <asm/bootinfo.h> +#include <asm/early_ioremap.h> +#include <asm/inst.h> +#include <asm/sections.h> +#include <asm/setup.h> + +#define RELOCATED(x) ((void *)((long)x + reloc_offset)) +#define RELOCATED_KASLR(x) ((void *)((long)x + random_offset)) + +static unsigned long 
reloc_offset; + +static inline void __init relocate_relative(void) +{ + Elf64_Rela *rela, *rela_end; + rela = (Elf64_Rela *)&__rela_dyn_begin; + rela_end = (Elf64_Rela *)&__rela_dyn_end; + + for ( ; rela < rela_end; rela++) { + Elf64_Addr addr = rela->r_offset; + Elf64_Addr relocated_addr = rela->r_addend; + + if (rela->r_info != R_LARCH_RELATIVE) + continue; + + if (relocated_addr >= VMLINUX_LOAD_ADDRESS) + relocated_addr = (Elf64_Addr)RELOCATED(relocated_addr); + + *(Elf64_Addr *)RELOCATED(addr) = relocated_addr; + } +} + +static inline void __init relocate_absolute(long random_offset) +{ + void *begin, *end; + struct rela_la_abs *p; + + begin = RELOCATED_KASLR(&__la_abs_begin); + end = RELOCATED_KASLR(&__la_abs_end); + + for (p = begin; (void *)p < end; p++) { + long v = p->symvalue; + uint32_t lu12iw, ori, lu32id, lu52id; + union loongarch_instruction *insn = (void *)p - p->offset; + + lu12iw = (v >> 12) & 0xfffff; + ori = v & 0xfff; + lu32id = (v >> 32) & 0xfffff; + lu52id = v >> 52; + + insn[0].reg1i20_format.immediate = lu12iw; + insn[1].reg2i12_format.immediate = ori; + insn[2].reg1i20_format.immediate = lu32id; + insn[3].reg2i12_format.immediate = lu52id; + } +} + +#ifdef CONFIG_RANDOMIZE_BASE +static inline __init unsigned long rotate_xor(unsigned long hash, + const void *area, size_t size) +{ + size_t i, diff; + const typeof(hash) *ptr = PTR_ALIGN(area, sizeof(hash)); + + diff = (void *)ptr - area; + if (size < diff + sizeof(hash)) + return hash; + + size = ALIGN_DOWN(size - diff, sizeof(hash)); + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +static inline __init unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + unsigned long entropy = random_get_entropy(); + + /* Attempt to create a simple but unpredictable starting entropy. 
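 + * Mixing the compile-time linux_banner with a
 + * random_get_entropy() timestamp is deliberately cheap: no
 + * proper RNG is available this early in boot.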
 */ + hash = rotate_xor(hash, linux_banner, strlen(linux_banner)); + + /* Add in any runtime entropy we can get */ + hash = rotate_xor(hash, &entropy, sizeof(entropy)); + + return hash; +} + +static inline __init bool kaslr_disabled(void) +{ + char *str; + const char *builtin_cmdline = CONFIG_CMDLINE; + + str = strstr(builtin_cmdline, "nokaslr"); + if (str == builtin_cmdline || (str > builtin_cmdline && *(str - 1) == ' ')) + return true; + + str = strstr(boot_command_line, "nokaslr"); + if (str == boot_command_line || (str > boot_command_line && *(str - 1) == ' ')) + return true; + + return false; +} + +/* Choose a new address for the kernel */ +static inline void __init *determine_relocation_address(void) +{ + unsigned long kernel_length; + unsigned long random_offset; + void *destination = _text; + + if (kaslr_disabled()) + return destination; + + kernel_length = (long)_end - (long)_text; + + random_offset = get_random_boot() << 16; + random_offset &= (CONFIG_RANDOMIZE_BASE_MAX_OFFSET - 1); + if (random_offset < kernel_length) + random_offset += ALIGN(kernel_length, 0xffff); + + return RELOCATED_KASLR(destination); +} + +static inline int __init relocation_addr_valid(void *location_new) +{ + if ((unsigned long)location_new & 0x00000ffff) + return 0; /* Inappropriately aligned new location */ + + if ((unsigned long)location_new < (unsigned long)_end) + return 0; /* New location overlaps original kernel */ + + return 1; +} +#endif + +static inline void __init update_reloc_offset(unsigned long *addr, long random_offset) +{ + unsigned long *new_addr = (unsigned long *)RELOCATED_KASLR(addr); + + *new_addr = (unsigned long)reloc_offset; +} + +void * __init relocate_kernel(void) +{ + unsigned long kernel_length; + unsigned long random_offset = 0; + void *location_new = _text; /* Default to original kernel start */ + void *kernel_entry = start_kernel; /* Default to original kernel entry point */ + char *cmdline = early_ioremap(fw_arg1, COMMAND_LINE_SIZE); /* Boot command line is passed in fw_arg1 */ + + strscpy(boot_command_line, cmdline, COMMAND_LINE_SIZE); + +#ifdef CONFIG_RANDOMIZE_BASE + location_new = determine_relocation_address(); + + /* Sanity check relocation address */ + if (relocation_addr_valid(location_new)) + random_offset = (unsigned long)location_new - (unsigned long)(_text); +#endif + reloc_offset = (unsigned long)_text - VMLINUX_LOAD_ADDRESS; + + if (random_offset) { + kernel_length = (long)(_end) - (long)(_text); + + /* Copy the kernel to its new location */ + memcpy(location_new, _text, kernel_length); + + /* Sync the caches ready for execution of the new kernel */ + __asm__ __volatile__ ( + "ibar 0 \t\n" + "dbar 0 \t\n" + ::: "memory"); + + reloc_offset += random_offset; + + /* Return the new kernel's entry point */ + kernel_entry = RELOCATED_KASLR(start_kernel); + + /* The current thread is now within the relocated kernel */ + __current_thread_info = RELOCATED_KASLR(__current_thread_info); + + update_reloc_offset(&reloc_offset, random_offset); + } + + if (reloc_offset) + relocate_relative(); + + relocate_absolute(random_offset); + + return kernel_entry; +} + +/* + * Show relocation information on panic.
+ */ +static void show_kernel_relocation(const char *level) +{ + if (reloc_offset > 0) { + printk(level); + pr_cont("Kernel relocated by 0x%lx\n", reloc_offset); + pr_cont(" .text @ 0x%px\n", _text); + pr_cont(" .data @ 0x%px\n", _sdata); + pr_cont(" .bss @ 0x%px\n", __bss_start); + } +} + +static int kernel_location_notifier_fn(struct notifier_block *self, + unsigned long v, void *p) +{ + show_kernel_relocation(KERN_EMERG); + return NOTIFY_DONE; +} + +static struct notifier_block kernel_location_notifier = { + .notifier_call = kernel_location_notifier_fn +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_location_notifier); + return 0; +} + +arch_initcall(register_kernel_offset_dumper); diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 4344502c0b31..bae84ccf6d36 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -234,11 +234,14 @@ static void __init arch_reserve_vmcore(void) #endif } +/* 2MB alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_2M +#define CRASH_ADDR_MAX SZ_4G + static void __init arch_parse_crashkernel(void) { #ifdef CONFIG_KEXEC int ret; - unsigned long long start; unsigned long long total_mem; unsigned long long crash_base, crash_size; @@ -247,8 +250,13 @@ static void __init arch_parse_crashkernel(void) if (ret < 0 || crash_size <= 0) return; - start = memblock_phys_alloc_range(crash_size, 1, crash_base, crash_base + crash_size); - if (start != crash_base) { + if (crash_base <= 0) { + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, CRASH_ALIGN, CRASH_ADDR_MAX); + if (!crash_base) { + pr_warn("crashkernel reservation failed - No suitable area found.\n"); + return; + } + } else if (!memblock_phys_alloc_range(crash_size, CRASH_ALIGN, crash_base, crash_base + crash_size)) { pr_warn("Invalid memory region reserved for crash kernel\n"); return; } diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index a6576dea590c..4351f69d9950 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -140,16 +140,17 @@ static int get_timer_irq(void) int constant_clockevent_init(void) { - int irq; unsigned int cpu = smp_processor_id(); unsigned long min_delta = 0x600; unsigned long max_delta = (1UL << 48) - 1; struct clock_event_device *cd; - static int timer_irq_installed = 0; + static int irq = 0, timer_irq_installed = 0; - irq = get_timer_irq(); - if (irq < 0) - pr_err("Failed to map irq %d (timer)\n", irq); + if (!timer_irq_installed) { + irq = get_timer_irq(); + if (irq < 0) + pr_err("Failed to map irq %d (timer)\n", irq); + } cd = &per_cpu(constant_clockevent_device, cpu); diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index c38a146a973b..de8ebe20b666 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -371,9 +371,14 @@ int no_unaligned_warning __read_mostly = 1; /* Only 1 warning by default */ asmlinkage void noinstr do_ale(struct pt_regs *regs) { - unsigned int *pc; irqentry_state_t state = irqentry_enter(regs); +#ifndef CONFIG_ARCH_STRICT_ALIGN + die_if_kernel("Kernel ale access", regs); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr); +#else + unsigned int *pc; + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->csr_badvaddr); /* @@ -397,8 +402,8 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs) sigbus: die_if_kernel("Kernel ale access", regs); force_sig_fault(SIGBUS, 
BUS_ADRALN, (void __user *)regs->csr_badvaddr); - out: +#endif irqentry_exit(regs, state); } @@ -432,7 +437,9 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) unsigned long era = exception_era(regs); irqentry_state_t state = irqentry_enter(regs); - local_irq_enable(); + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_enable(); + current->thread.trap_nr = read_csr_excode(); if (__get_inst(&opcode, (u32 *)era, user)) goto out_sigsegv; @@ -445,14 +452,12 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) */ switch (bcode) { case BRK_KPROBE_BP: - if (notify_die(DIE_BREAK, "Kprobe", regs, bcode, - current->thread.trap_nr, SIGTRAP) == NOTIFY_STOP) + if (kprobe_breakpoint_handler(regs)) goto out; else break; case BRK_KPROBE_SSTEPBP: - if (notify_die(DIE_SSTEPBP, "Kprobe_SingleStep", regs, bcode, - current->thread.trap_nr, SIGTRAP) == NOTIFY_STOP) + if (kprobe_singlestep_handler(regs)) goto out; else break; @@ -495,7 +500,9 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) } out: - local_irq_disable(); + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_disable(); + irqentry_exit(regs, state); return; @@ -506,7 +513,52 @@ out_sigsegv: asmlinkage void noinstr do_watch(struct pt_regs *regs) { + irqentry_state_t state = irqentry_enter(regs); + +#ifndef CONFIG_HAVE_HW_BREAKPOINT pr_warn("Hardware watch point handler not implemented!\n"); +#else + if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) { + int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1); + unsigned long pc = instruction_pointer(regs); + union loongarch_instruction *ip = (union loongarch_instruction *)pc; + + if (llbit) { + /* + * When the ll-sc combo is encountered, it is regarded as a single + * instruction, so don't clear llbit or reset CSR.FWPS.Skip until + * the ll-sc execution is completed. + */ + csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS); + csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL); + goto out; + } + + if (pc == current->thread.single_step) { + /* + * Certain insns are occasionally not skipped when CSR.FWPS.Skip is + * set, such as fld.d/fst.d. So singlestep needs to compare whether + * the csr_era is equal to the value of single_step set last time. + */ + if (!is_self_loop_ins(ip, regs)) { + /* + * Check whether the target pc of the given instruction equals + * the current pc. If yes, we should not set the CSR.FWPS.SKIP + * bit, so as not to break the original instruction stream. + */ + csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS); + goto out; + } + } + } else { + breakpoint_handler(regs); + watchpoint_handler(regs); + } + + force_sig(SIGTRAP); +out: +#endif + irqentry_exit(regs, state); } asmlinkage void noinstr do_ri(struct pt_regs *regs) diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index 78506b31ba61..0c7b041be9d8 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -65,10 +65,21 @@ SECTIONS __alt_instructions_end = .; } +#ifdef CONFIG_RELOCATABLE + . = ALIGN(8); + .la_abs : AT(ADDR(.la_abs) - LOAD_OFFSET) { + __la_abs_begin = .; + *(.la_abs) + __la_abs_end = .; + } +#endif + .got : ALIGN(16) { *(.got) } .plt : ALIGN(16) { *(.plt) } .got.plt : ALIGN(16) { *(.got.plt) } + .data.rel : { *(.data.rel*) } + .
= ALIGN(PECOFF_SEGMENT_ALIGN); __init_begin = .; __inittext_begin = .; @@ -92,8 +103,6 @@ SECTIONS PERCPU_SECTION(1 << CONFIG_L1_CACHE_SHIFT) #endif - .rela.dyn : ALIGN(8) { *(.rela.dyn) *(.rela*) } - .init.bss : { *(.init.bss) } @@ -106,6 +115,12 @@ SECTIONS RO_DATA(4096) RW_DATA(1 << CONFIG_L1_CACHE_SHIFT, PAGE_SIZE, THREAD_SIZE) + .rela.dyn : ALIGN(8) { + __rela_dyn_begin = .; + *(.rela.dyn) *(.rela*) + __rela_dyn_end = .; + } + .sdata : { *(.sdata) } @@ -132,6 +147,7 @@ SECTIONS DISCARDS /DISCARD/ : { + *(.dynamic .dynsym .dynstr .hash .gnu.hash) *(.gnu.attributes) *(.options) *(.eh_frame) diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S index 7c07d595ee89..3b7e1dec7109 100644 --- a/arch/loongarch/lib/memcpy.S +++ b/arch/loongarch/lib/memcpy.S @@ -17,6 +17,7 @@ SYM_FUNC_START(memcpy) ALTERNATIVE "b __memcpy_generic", \ "b __memcpy_fast", CPU_FEATURE_UAL SYM_FUNC_END(memcpy) +_ASM_NOKPROBE(memcpy) EXPORT_SYMBOL(memcpy) @@ -41,6 +42,7 @@ SYM_FUNC_START(__memcpy_generic) 2: move a0, a3 jr ra SYM_FUNC_END(__memcpy_generic) +_ASM_NOKPROBE(__memcpy_generic) /* * void *__memcpy_fast(void *dst, const void *src, size_t n) @@ -93,3 +95,4 @@ SYM_FUNC_START(__memcpy_fast) 3: move a0, a3 jr ra SYM_FUNC_END(__memcpy_fast) +_ASM_NOKPROBE(__memcpy_fast) diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S index 6ffdb46da78f..b796c3d6da05 100644 --- a/arch/loongarch/lib/memmove.S +++ b/arch/loongarch/lib/memmove.S @@ -29,6 +29,7 @@ SYM_FUNC_START(memmove) b rmemcpy 4: b __rmemcpy_generic SYM_FUNC_END(memmove) +_ASM_NOKPROBE(memmove) EXPORT_SYMBOL(memmove) @@ -39,6 +40,7 @@ SYM_FUNC_START(rmemcpy) ALTERNATIVE "b __rmemcpy_generic", \ "b __rmemcpy_fast", CPU_FEATURE_UAL SYM_FUNC_END(rmemcpy) +_ASM_NOKPROBE(rmemcpy) /* * void *__rmemcpy_generic(void *dst, const void *src, size_t n) @@ -64,6 +66,7 @@ SYM_FUNC_START(__rmemcpy_generic) 2: move a0, a3 jr ra SYM_FUNC_END(__rmemcpy_generic) +_ASM_NOKPROBE(__rmemcpy_generic) /* * void *__rmemcpy_fast(void *dst, const void *src, size_t n) @@ -119,3 +122,4 @@ SYM_FUNC_START(__rmemcpy_fast) 3: move a0, a3 jr ra SYM_FUNC_END(__rmemcpy_fast) +_ASM_NOKPROBE(__rmemcpy_fast) diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S index e7cb4ea3747d..a9eb732ab2ad 100644 --- a/arch/loongarch/lib/memset.S +++ b/arch/loongarch/lib/memset.S @@ -23,6 +23,7 @@ SYM_FUNC_START(memset) ALTERNATIVE "b __memset_generic", \ "b __memset_fast", CPU_FEATURE_UAL SYM_FUNC_END(memset) +_ASM_NOKPROBE(memset) EXPORT_SYMBOL(memset) @@ -45,6 +46,7 @@ SYM_FUNC_START(__memset_generic) 2: move a0, a3 jr ra SYM_FUNC_END(__memset_generic) +_ASM_NOKPROBE(__memset_generic) /* * void *__memset_fast(void *s, int c, size_t n) @@ -89,3 +91,4 @@ SYM_FUNC_START(__memset_fast) 3: move a0, a3 jr ra SYM_FUNC_END(__memset_fast) +_ASM_NOKPROBE(__memset_fast) diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index 1ccd53655cab..449087bd589d 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -135,6 +135,9 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, struct vm_area_struct *vma = NULL; vm_fault_t fault; + if (kprobe_page_fault(regs, current->thread.trap_nr)) + return; + /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. 
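The relocate_absolute() handler added above patches, in place, the four-instruction sequence that la.abs expands to: lu12i.w, ori, lu32i.d and lu52i.d. A minimal user-space sketch of the same immediate split, assuming only the shift/mask layout visible in that code; split_la_abs() and the sample address are illustrative, not kernel interfaces:

#include <stdint.h>
#include <stdio.h>

/*
 * Split a 64-bit symbol value into the immediates of the la.abs
 * expansion, exactly as relocate_absolute() does: bits 31..12 feed
 * lu12i.w, bits 11..0 feed ori, bits 51..32 feed lu32i.d and
 * bits 63..52 feed lu52i.d.
 */
static void split_la_abs(uint64_t v)
{
	uint32_t lu12iw = (v >> 12) & 0xfffff;
	uint32_t ori = v & 0xfff;
	uint32_t lu32id = (v >> 32) & 0xfffff;
	uint32_t lu52id = (uint32_t)(v >> 52);

	printf("lu12i.w=%#x ori=%#x lu32i.d=%#x lu52i.d=%#x\n",
	       lu12iw, ori, lu32id, lu52id);
}

int main(void)
{
	split_la_abs(0x9000000001034000ULL);	/* arbitrary example VA */
	return 0;
}

Patching the immediates directly, rather than loading addresses from a literal pool, is what lets these absolute sequences be fixed up for KASLR after the kernel image has been copied to its new location.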
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index 58781c6e4191..244e2f5aeee5 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -24,8 +24,7 @@ move a0, sp REG_S a2, sp, PT_BVADDR li.w a1, \write - la.abs t0, do_page_fault - jirl ra, t0, 0 + bl do_page_fault RESTORE_ALL_AND_RET SYM_FUNC_END(tlb_do_page_fault_\write) .endm @@ -40,7 +39,7 @@ SYM_FUNC_START(handle_tlb_protect) move a1, zero csrrd a2, LOONGARCH_CSR_BADV REG_S a2, sp, PT_BVADDR - la.abs t0, do_page_fault + la_abs t0, do_page_fault jirl ra, t0, 0 RESTORE_ALL_AND_RET SYM_FUNC_END(handle_tlb_protect) @@ -116,7 +115,7 @@ smp_pgtable_change_load: #ifdef CONFIG_64BIT vmalloc_load: - la.abs t1, swapper_pg_dir + la_abs t1, swapper_pg_dir b vmalloc_done_load #endif @@ -187,7 +186,7 @@ tlb_huge_update_load: nopage_tlb_load: dbar 0 csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_0 + la_abs t0, tlb_do_page_fault_0 jr t0 SYM_FUNC_END(handle_tlb_load) @@ -263,7 +262,7 @@ smp_pgtable_change_store: #ifdef CONFIG_64BIT vmalloc_store: - la.abs t1, swapper_pg_dir + la_abs t1, swapper_pg_dir b vmalloc_done_store #endif @@ -336,7 +335,7 @@ tlb_huge_update_store: nopage_tlb_store: dbar 0 csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_1 + la_abs t0, tlb_do_page_fault_1 jr t0 SYM_FUNC_END(handle_tlb_store) @@ -411,7 +410,7 @@ smp_pgtable_change_modify: #ifdef CONFIG_64BIT vmalloc_modify: - la.abs t1, swapper_pg_dir + la_abs t1, swapper_pg_dir b vmalloc_done_modify #endif @@ -483,7 +482,7 @@ tlb_huge_update_modify: nopage_tlb_modify: dbar 0 csrrd ra, EXCEPTION_KS2 - la.abs t0, tlb_do_page_fault_1 + la_abs t0, tlb_do_page_fault_1 jr t0 SYM_FUNC_END(handle_tlb_modify) diff --git a/arch/loongarch/power/suspend_asm.S b/arch/loongarch/power/suspend_asm.S index eb2675642f9f..90da899c06a1 100644 --- a/arch/loongarch/power/suspend_asm.S +++ b/arch/loongarch/power/suspend_asm.S @@ -78,9 +78,8 @@ SYM_INNER_LABEL(loongarch_wakeup_start, SYM_L_GLOBAL) li.d t0, CSR_DMW1_INIT # CA, PLV0 csrwr t0, LOONGARCH_CSR_DMWIN1 - la.abs t0, 0f - jr t0 -0: + JUMP_VIRT_ADDR t0, t1 + la.pcrel t0, acpi_saved_sp ld.d sp, t0, 0 SETUP_WAKEUP diff --git a/arch/m68k/68000/dragen2.c b/arch/m68k/68000/dragen2.c index 1a57eff28cfe..7f1804e31a06 100644 --- a/arch/m68k/68000/dragen2.c +++ b/arch/m68k/68000/dragen2.c @@ -15,7 +15,7 @@ #include "screen.h" /***************************************************************************/ -/* Init Drangon Engine hardware */ +/* Init Dragon Engine II hardware */ /***************************************************************************/ static void dragen2_reset(void) diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index 53c45ccda564..e2f961208f18 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -192,18 +192,18 @@ config UCSIMM Support for the Arcturus Networks uCsimm module. config UCDIMM - bool "uDsimm module support" + bool "uCdimm module support" depends on !MMU select M68VZ328 help - Support for the Arcturus Networks uDsimm module. + Support for the Arcturus Networks uCdimm module. config DRAGEN2 - bool "DragenEngine II board support" + bool "DragonEngine II board support" depends on !MMU select M68VZ328 help - Support for the DragenEngine II board. + Support for the DragonEngine II board. 
config DIRECT_IO_ACCESS bool "Allow user to access IO directly" diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild index 9e8071f0e58f..af2967bffb73 100644 --- a/arch/mips/Kbuild +++ b/arch/mips/Kbuild @@ -7,7 +7,7 @@ subdir-ccflags-y := -Werror endif # platform specific definitions -include arch/mips/Kbuild.platforms +include $(srctree)/arch/mips/Kbuild.platforms obj-y := $(platform-y) # make clean traverses $(obj-) without having included .config, so diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 15cb692b0a09..37072e15b263 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -445,6 +445,7 @@ config LANTIQ select IRQ_MIPS_CPU select CEVT_R4K select CSRC_R4K + select NO_EXCEPT_FILL select SYS_HAS_CPU_MIPS32_R1 select SYS_HAS_CPU_MIPS32_R2 select SYS_SUPPORTS_BIG_ENDIAN diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 490dea07d4e0..04e46ec24319 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -95,7 +95,7 @@ all-$(CONFIG_SYS_SUPPORTS_ZBOOT)+= vmlinuz # crossformat linking we rely on the elf2ecoff tool for format conversion. # cflags-y += -G 0 -mno-abicalls -fno-pic -pipe -cflags-y += -msoft-float +cflags-y += -msoft-float -Wa,-msoft-float LDFLAGS_vmlinux += -G 0 -static -n -nostdlib KBUILD_AFLAGS_MODULE += -mlong-calls KBUILD_CFLAGS_MODULE += -mlong-calls @@ -104,15 +104,6 @@ ifeq ($(CONFIG_RELOCATABLE),y) LDFLAGS_vmlinux += --emit-relocs endif -# -# pass -msoft-float to GAS if it supports it. However on newer binutils -# (specifically newer than 2.24.51.20140728) we then also need to explicitly -# set ".set hardfloat" in all files which manipulate floating point registers. -# -ifneq ($(call as-option,-Wa$(comma)-msoft-float,),) - cflags-y += -DGAS_HAS_SET_HARDFLOAT -Wa,-msoft-float -endif - cflags-y += -ffreestanding cflags-$(CONFIG_CPU_BIG_ENDIAN) += -EB @@ -152,7 +143,7 @@ cflags-y += -fno-stack-check # # Avoid this by explicitly disabling that assembler behaviour. # -cflags-y += $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,) +cflags-y += $(call cc-option,-Wa$(comma)-mno-fix-loongson3-llsc,) # # CPU-dependent compiler/assembler options for optimization. 
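Passing -Wa,-msoft-float unconditionally, as the mips/Makefile hunk above now does, is what allows the GAS_HAS_SET_HARDFLOAT probe to be retired and the SET_HARDFLOAT macro to become a literal ".set hardfloat" in the asmmacro and mipsregs hunks further below. A minimal sketch of the pattern those sites rely on, assuming a MIPS toolchain building soft-float objects; read_fcsr() is an illustrative name, not a kernel interface:

/*
 * Read the FP control/status register (FCSR, $31) from C. The object
 * is assembled with -msoft-float, so GAS would normally reject cfc1;
 * ".set hardfloat" re-enables FP mnemonics for just this sequence.
 */
static inline unsigned int read_fcsr(void)
{
	unsigned int res;

	__asm__ __volatile__(
	"	.set	push		\n"
	"	.set	hardfloat	\n"
	"	cfc1	%0, $31		\n"
	"	.set	pop		\n"
	: "=r" (res));

	return res;
}

Since the kernel's minimum supported binutils understands -msoft-float, the directive can be used unconditionally instead of being hidden behind a configure-time check.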
diff --git a/arch/mips/Makefile.postlink b/arch/mips/Makefile.postlink index 4b1d3ba3a8a2..34e3bd71f3b0 100644 --- a/arch/mips/Makefile.postlink +++ b/arch/mips/Makefile.postlink @@ -10,7 +10,7 @@ PHONY := __archpost __archpost: -include include/config/auto.conf -include scripts/Kbuild.include +include $(srctree)/scripts/Kbuild.include CMD_LS3_LLSC = arch/mips/tools/loongson3-llsc-check quiet_cmd_ls3_llsc = LLSCCHK $@ diff --git a/arch/mips/bcm47xx/board.c b/arch/mips/bcm47xx/board.c index 8ef002471b9c..90fb48b046c0 100644 --- a/arch/mips/bcm47xx/board.c +++ b/arch/mips/bcm47xx/board.c @@ -130,6 +130,7 @@ struct bcm47xx_board_type_list2 bcm47xx_board_list_boot_hw[] __initconst = { {{BCM47XX_BOARD_LINKSYS_E1000V21, "Linksys E1000 V2.1"}, "E1000", "2.1"}, {{BCM47XX_BOARD_LINKSYS_E1200V2, "Linksys E1200 V2"}, "E1200", "2.0"}, {{BCM47XX_BOARD_LINKSYS_E2000V1, "Linksys E2000 V1"}, "Linksys E2000", "1.0"}, + {{BCM47XX_BOARD_LINKSYS_E2500V3, "Linksys E2500 V3"}, "E2500", "1.0"}, /* like WRT610N v2.0 */ {{BCM47XX_BOARD_LINKSYS_E3000V1, "Linksys E3000 V1"}, "E300", "1.0"}, {{BCM47XX_BOARD_LINKSYS_E3200V1, "Linksys E3200 V1"}, "E3200", "1.0"}, diff --git a/arch/mips/bcm47xx/buttons.c b/arch/mips/bcm47xx/buttons.c index 38e4a9cbcf4e..437a737c01dd 100644 --- a/arch/mips/bcm47xx/buttons.c +++ b/arch/mips/bcm47xx/buttons.c @@ -223,6 +223,12 @@ bcm47xx_buttons_linksys_e2000v1[] __initconst = { }; static const struct gpio_keys_button +bcm47xx_buttons_linksys_e2500v3[] __initconst = { + BCM47XX_GPIO_KEY(9, KEY_WPS_BUTTON), + BCM47XX_GPIO_KEY(10, KEY_RESTART), +}; + +static const struct gpio_keys_button bcm47xx_buttons_linksys_e3000v1[] __initconst = { BCM47XX_GPIO_KEY(4, KEY_WPS_BUTTON), BCM47XX_GPIO_KEY(6, KEY_RESTART), @@ -617,6 +623,9 @@ int __init bcm47xx_buttons_register(void) case BCM47XX_BOARD_LINKSYS_E2000V1: err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e2000v1); break; + case BCM47XX_BOARD_LINKSYS_E2500V3: + err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e2500v3); + break; case BCM47XX_BOARD_LINKSYS_E3000V1: err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e3000v1); break; diff --git a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts index 2fdb4baad19c..cb460eaf8835 100644 --- a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts +++ b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts @@ -20,27 +20,27 @@ leds { compatible = "gpio-leds"; - usb1 { + led-usb1 { label = "usb1"; gpios = <&gpio 9 GPIO_ACTIVE_LOW>; }; - usb2 { + led-usb2 { label = "usb2"; gpios = <&gpio 10 GPIO_ACTIVE_LOW>; }; - wps { + led-wps { label = "wps"; gpios = <&gpio 11 GPIO_ACTIVE_LOW>; }; - wireless1 { + led-wireless1 { label = "5g"; gpios = <&gpio 17 GPIO_ACTIVE_LOW>; }; - wireless2 { + led-wireless2 { label = "2.4g"; gpios = <&gpio 18 GPIO_ACTIVE_LOW>; }; diff --git a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts index e04237281b41..c55845fd84ca 100644 --- a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts +++ b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts @@ -21,15 +21,15 @@ leds { compatible = "gpio-leds"; - usb { + led-usb { gpios = <&gpio 9 GPIO_ACTIVE_LOW>; }; - wps { + led-wps { gpios = <&gpio 11 GPIO_ACTIVE_LOW>; }; - wireless { + led-wireless { label = "2.4g"; gpios = <&gpio 18 GPIO_ACTIVE_LOW>; }; diff --git a/arch/mips/boot/dts/img/boston.dts b/arch/mips/boot/dts/img/boston.dts index 84328afa3a55..72f7605d2e31 100644 --- a/arch/mips/boot/dts/img/boston.dts +++ 
b/arch/mips/boot/dts/img/boston.dts @@ -125,7 +125,7 @@ #interrupt-cells = <1>; }; - pci2_root@0,0,0 { + pci2_root@0,0 { compatible = "pci10ee,7021"; reg = <0x00000000 0 0 0 0>; diff --git a/arch/mips/boot/dts/ingenic/ci20.dts b/arch/mips/boot/dts/ingenic/ci20.dts index f38c39572a9e..239c4537484d 100644 --- a/arch/mips/boot/dts/ingenic/ci20.dts +++ b/arch/mips/boot/dts/ingenic/ci20.dts @@ -42,25 +42,25 @@ leds { compatible = "gpio-leds"; - led0 { + led-0 { label = "ci20:red:led0"; gpios = <&gpc 3 GPIO_ACTIVE_HIGH>; linux,default-trigger = "none"; }; - led1 { + led-1 { label = "ci20:red:led1"; gpios = <&gpc 2 GPIO_ACTIVE_HIGH>; linux,default-trigger = "nand-disk"; }; - led2 { + led-2 { label = "ci20:red:led2"; gpios = <&gpc 1 GPIO_ACTIVE_HIGH>; linux,default-trigger = "cpu1"; }; - led3 { + led-3 { label = "ci20:red:led3"; gpios = <&gpc 0 GPIO_ACTIVE_HIGH>; linux,default-trigger = "cpu0"; @@ -113,7 +113,7 @@ regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; - gpio = <&gpf 14 GPIO_ACTIVE_LOW>; + gpio = <&gpf 15 GPIO_ACTIVE_LOW>; enable-active-high; }; }; diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi index c182a656d63b..18affff85ce3 100644 --- a/arch/mips/boot/dts/ingenic/jz4780.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi @@ -155,6 +155,8 @@ clocks = <&cgu JZ4780_CLK_RTCLK>; clock-names = "rtc"; + + #clock-cells = <0>; }; pinctrl: pin-controller@10010000 { diff --git a/arch/mips/boot/dts/lantiq/danube.dtsi b/arch/mips/boot/dts/lantiq/danube.dtsi index 510be63c8bdf..7a7ba66aa534 100644 --- a/arch/mips/boot/dts/lantiq/danube.dtsi +++ b/arch/mips/boot/dts/lantiq/danube.dtsi @@ -40,7 +40,6 @@ eiu0: eiu@101000 { #interrupt-cells = <1>; interrupt-controller; - interrupt-parent; compatible = "lantiq,eiu-xway"; reg = <0x101000 0x1000>; }; diff --git a/arch/mips/boot/dts/pic32/pic32mzda_sk.dts b/arch/mips/boot/dts/pic32/pic32mzda_sk.dts index ab70637bbec5..b1c5ffdb33fc 100644 --- a/arch/mips/boot/dts/pic32/pic32mzda_sk.dts +++ b/arch/mips/boot/dts/pic32/pic32mzda_sk.dts @@ -28,19 +28,19 @@ pinctrl-names = "default"; pinctrl-0 = <&user_leds_s0>; - led@1 { + led-1 { label = "pic32mzda_sk:red:led1"; gpios = <&gpio7 0 GPIO_ACTIVE_HIGH>; linux,default-trigger = "heartbeat"; }; - led@2 { + led-2 { label = "pic32mzda_sk:yellow:led2"; gpios = <&gpio7 1 GPIO_ACTIVE_HIGH>; linux,default-trigger = "mmc0"; }; - led@3 { + led-3 { label = "pic32mzda_sk:green:led3"; gpios = <&gpio7 2 GPIO_ACTIVE_HIGH>; default-state = "on"; diff --git a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts index f3dff4009ab5..f894fe17816b 100644 --- a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts +++ b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts @@ -41,23 +41,23 @@ leds { compatible = "gpio-leds"; - led@0 { + led-0 { label = "tp-link:green:usb"; gpios = <&gpio 1 GPIO_ACTIVE_LOW>; }; - led@1 { + led-1 { label = "tp-link:green:system"; gpios = <&gpio 2 GPIO_ACTIVE_LOW>; linux,default-trigger = "heartbeat"; }; - led@2 { + led-2 { label = "tp-link:green:qss"; gpios = <&gpio 5 GPIO_ACTIVE_HIGH>; }; - led@3 { + led-3 { label = "tp-link:green:wlan"; gpios = <&gpio 9 GPIO_ACTIVE_LOW>; }; diff --git a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts index 40e4c5da0e65..7affa58d4fa6 100644 --- a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts +++ b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts @@ -22,25 +22,25 @@ leds { compatible = "gpio-leds"; - wlan { + led-wlan { 
label = "dragino2:red:wlan"; gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - lan { + led-lan { label = "dragino2:red:lan"; gpios = <&gpio 13 GPIO_ACTIVE_LOW>; default-state = "off"; }; - wan { + led-wan { label = "dragino2:red:wan"; gpios = <&gpio 17 GPIO_ACTIVE_LOW>; default-state = "off"; }; - system { + led-system { label = "dragino2:red:system"; gpios = <&gpio 28 GPIO_ACTIVE_HIGH>; default-state = "off"; diff --git a/arch/mips/boot/dts/qca/ar9331_omega.dts b/arch/mips/boot/dts/qca/ar9331_omega.dts index ed184d861d5f..8904aa917a6e 100644 --- a/arch/mips/boot/dts/qca/ar9331_omega.dts +++ b/arch/mips/boot/dts/qca/ar9331_omega.dts @@ -22,7 +22,7 @@ leds { compatible = "gpio-leds"; - system { + led-system { label = "onion:amber:system"; gpios = <&gpio 27 GPIO_ACTIVE_LOW>; default-state = "off"; diff --git a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts index 5f424c2cd781..10b9759228b7 100644 --- a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts +++ b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts @@ -22,25 +22,25 @@ leds { compatible = "gpio-leds"; - wlan { + led-wlan { label = "tp-link:green:wlan"; gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - lan { + led-lan { label = "tp-link:green:lan"; gpios = <&gpio 17 GPIO_ACTIVE_LOW>; default-state = "off"; }; - wps { + led-wps { label = "tp-link:green:wps"; gpios = <&gpio 26 GPIO_ACTIVE_LOW>; default-state = "off"; }; - led3g { + led-led3g { label = "tp-link:green:3g"; gpios = <&gpio 27 GPIO_ACTIVE_LOW>; default-state = "off"; diff --git a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts index 179558161f85..18107ca0a06b 100644 --- a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts +++ b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts @@ -47,67 +47,67 @@ * (see below). So we can't include it in this LED node. 
*/ - power_blue { + led-power-blue { label = "smartgw:power:blue"; gpios = <&gpio 18 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - power_green { + led-power-green { label = "smartgw:power:green"; gpios = <&gpio 19 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - power_red { + led-power-red { label = "smartgw:power:red"; gpios = <&gpio 22 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - radio_blue { + led-radio-blue { label = "smartgw:radio:blue"; gpios = <&gpio 23 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - radio_green { + led-radio-green { label = "smartgw:radio:green"; gpios = <&gpio 24 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - radio_red { + led-radio-red { label = "smartgw:radio:red"; gpios = <&gpio 25 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - internet_blue { + led-internet-blue { label = "smartgw:internet:blue"; gpios = <&gpio 26 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - internet_green { + led-internet-green { label = "smartgw:internet:green"; gpios = <&gpio 27 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - internet_red { + led-internet-red { label = "smartgw:internet:red"; gpios = <&gpio 28 GPIO_ACTIVE_HIGH>; default-state = "off"; }; - ethernet_link { + led-ethernet-link { label = "smartgw:eth:link"; gpios = <&gpio 3 GPIO_ACTIVE_LOW>; linux,default-trigger = "netdev"; }; - ethernet_activity { + led-ethernet-activity { label = "smartgw:eth:act"; gpios = <&gpio 43 GPIO_ACTIVE_LOW>; linux,default-trigger = "netdev"; diff --git a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts index 0128bd8fa7ed..129b6710b699 100644 --- a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts +++ b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts @@ -33,13 +33,13 @@ gpio-leds { compatible = "gpio-leds"; - power { + led-power { label = "green:power"; gpios = <&gpio 6 GPIO_ACTIVE_LOW>; linux,default-trigger = "default-on"; }; - system { + led-system { label = "green:system"; gpios = <&gpio 8 GPIO_ACTIVE_LOW>; linux,default-trigger = "disk-activity"; @@ -91,22 +91,16 @@ status = "okay"; }; -&gmac1 { - status = "okay"; - phy-handle = <ðphy4>; -}; - -&mdio { - ethphy4: ethernet-phy@4 { - reg = <4>; - }; -}; - &switch0 { ports { port@0 { status = "okay"; label = "ethblack"; }; + + port@4 { + status = "okay"; + label = "ethblue"; + }; }; }; diff --git a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts index e31417569e09..f810cd10f4f4 100644 --- a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts +++ b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts @@ -33,33 +33,33 @@ gpio-leds { compatible = "gpio-leds"; - ethblack-green { + led-ethblack-green { label = "green:ethblack"; gpios = <&gpio 3 GPIO_ACTIVE_LOW>; }; - ethblue-green { + led-ethblue-green { label = "green:ethblue"; gpios = <&gpio 4 GPIO_ACTIVE_LOW>; }; - ethyellow-green { + led-ethyellow-green { label = "green:ethyellow"; gpios = <&gpio 15 GPIO_ACTIVE_LOW>; }; - ethyellow-orange { + led-ethyellow-orange { label = "orange:ethyellow"; gpios = <&gpio 13 GPIO_ACTIVE_LOW>; }; - power { + led-power { label = "green:power"; gpios = <&gpio 6 GPIO_ACTIVE_LOW>; linux,default-trigger = "default-on"; }; - system { + led-system { label = "green:system"; gpios = <&gpio 8 GPIO_ACTIVE_LOW>; linux,default-trigger = "disk-activity"; @@ -112,9 +112,12 @@ }; &gmac1 { - status = "okay"; phy-mode = "rgmii-rxid"; phy-handle = <ðphy5>; + + fixed-link { + status = "disabled"; + }; }; &mdio { @@ -134,5 +137,9 @@ status = "okay"; label = "ethblue"; }; + 
+ port@5 { + status = "disabled"; + }; }; }; diff --git a/arch/mips/boot/dts/ralink/mt7621.dtsi b/arch/mips/boot/dts/ralink/mt7621.dtsi index aec85c779359..290d47fbcfbb 100644 --- a/arch/mips/boot/dts/ralink/mt7621.dtsi +++ b/arch/mips/boot/dts/ralink/mt7621.dtsi @@ -332,8 +332,13 @@ gmac1: mac@1 { compatible = "mediatek,eth-mac"; reg = <1>; - status = "disabled"; phy-mode = "rgmii"; + + fixed-link { + speed = <1000>; + full-duplex; + pause; + }; }; mdio: mdio-bus { @@ -384,6 +389,18 @@ label = "swp4"; }; + port@5 { + reg = <5>; + ethernet = <&gmac1>; + phy-mode = "rgmii"; + + fixed-link { + speed = <1000>; + full-duplex; + pause; + }; + }; + port@6 { reg = <6>; ethernet = <&gmac0>; diff --git a/arch/mips/cavium-octeon/octeon-usb.c b/arch/mips/cavium-octeon/octeon-usb.c index 5cffe1ed2447..28677c615175 100644 --- a/arch/mips/cavium-octeon/octeon-usb.c +++ b/arch/mips/cavium-octeon/octeon-usb.c @@ -245,7 +245,7 @@ static int dwc3_octeon_config_power(struct device *dev, u64 base) power_active_low = 0; gpio = gpio_pwr[1]; } else { - dev_err(dev, "dwc3 controller clock init failure.\n"); + dev_err(dev, "invalid power configuration\n"); return -EINVAL; } if ((OCTEON_IS_MODEL(OCTEON_CN73XX) || @@ -278,7 +278,7 @@ static int dwc3_octeon_config_power(struct device *dev, u64 base) uctl_host_cfg.s.ppc_en = 0; uctl_host_cfg.s.ppc_active_high_en = 0; cvmx_write_csr(base + UCTL_HOST_CFG, uctl_host_cfg.u64); - dev_warn(dev, "dwc3 controller clock init failure.\n"); + dev_info(dev, "power control disabled\n"); } return 0; } @@ -301,19 +301,19 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base) i = of_property_read_u32(dev->of_node, "refclk-frequency", &clock_rate); if (i) { - pr_err("No UCTL \"refclk-frequency\"\n"); + dev_err(dev, "No UCTL \"refclk-frequency\"\n"); return -EINVAL; } i = of_property_read_string(dev->of_node, "refclk-type-ss", &ss_clock_type); if (i) { - pr_err("No UCTL \"refclk-type-ss\"\n"); + dev_err(dev, "No UCTL \"refclk-type-ss\"\n"); return -EINVAL; } i = of_property_read_string(dev->of_node, "refclk-type-hs", &hs_clock_type); if (i) { - pr_err("No UCTL \"refclk-type-hs\"\n"); + dev_err(dev, "No UCTL \"refclk-type-hs\"\n"); return -EINVAL; } if (strcmp("dlmc_ref_clk0", ss_clock_type) == 0) { @@ -322,29 +322,29 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base) else if (strcmp(hs_clock_type, "pll_ref_clk") == 0) ref_clk_sel = 2; else - pr_err("Invalid HS clock type %s, using pll_ref_clk instead\n", - hs_clock_type); + dev_warn(dev, "Invalid HS clock type %s, using pll_ref_clk instead\n", + hs_clock_type); } else if (strcmp(ss_clock_type, "dlmc_ref_clk1") == 0) { if (strcmp(hs_clock_type, "dlmc_ref_clk1") == 0) ref_clk_sel = 1; else if (strcmp(hs_clock_type, "pll_ref_clk") == 0) ref_clk_sel = 3; else { - pr_err("Invalid HS clock type %s, using pll_ref_clk instead\n", - hs_clock_type); + dev_warn(dev, "Invalid HS clock type %s, using pll_ref_clk instead\n", + hs_clock_type); ref_clk_sel = 3; } } else - pr_err("Invalid SS clock type %s, using dlmc_ref_clk0 instead\n", - ss_clock_type); + dev_warn(dev, "Invalid SS clock type %s, using dlmc_ref_clk0 instead\n", + ss_clock_type); if ((ref_clk_sel == 0 || ref_clk_sel == 1) && - (clock_rate != 100000000)) - pr_err("Invalid UCTL clock rate of %u, using 100000000 instead\n", - clock_rate); + (clock_rate != 100000000)) + dev_warn(dev, "Invalid UCTL clock rate of %u, using 100000000 instead\n", + clock_rate); } else { - pr_err("No USB UCTL device node\n"); + dev_err(dev, "No USB UCTL device node\n"); return 
-EINVAL; } @@ -396,8 +396,8 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base) uctl_ctl.s.ref_clk_div2 = 0; switch (clock_rate) { default: - dev_err(dev, "Invalid ref_clk %u, using 100000000 instead\n", - clock_rate); + dev_warn(dev, "Invalid ref_clk %u, using 100000000 instead\n", + clock_rate); fallthrough; case 100000000: mpll_mul = 0x19; @@ -438,10 +438,8 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base) udelay(10); /* Step 8c: Set up power control. */ - if (dwc3_octeon_config_power(dev, base)) { - dev_err(dev, "Error configuring power.\n"); + if (dwc3_octeon_config_power(dev, base)) return -EINVAL; - } /* Step 8d: Deassert UAHC reset signal. */ uctl_ctl.u64 = cvmx_read_csr(uctl_ctl_reg); @@ -529,10 +527,10 @@ static int __init dwc3_octeon_device_init(void) } mutex_lock(&dwc3_octeon_clocks_mutex); - dwc3_octeon_clocks_start(&pdev->dev, (u64)base); + if (dwc3_octeon_clocks_start(&pdev->dev, (u64)base) == 0) + dev_info(&pdev->dev, "clocks initialized.\n"); dwc3_octeon_set_endian_mode((u64)base); dwc3_octeon_phy_reset((u64)base); - dev_info(&pdev->dev, "clocks initialized.\n"); mutex_unlock(&dwc3_octeon_clocks_mutex); devm_iounmap(&pdev->dev, base); devm_release_mem_region(&pdev->dev, res->start, diff --git a/arch/mips/include/asm/asmmacro-32.h b/arch/mips/include/asm/asmmacro-32.h index 1c08c1f7903c..83a4940b7c89 100644 --- a/arch/mips/include/asm/asmmacro-32.h +++ b/arch/mips/include/asm/asmmacro-32.h @@ -15,7 +15,7 @@ .macro fpu_save_single thread tmp=t0 .set push - SET_HARDFLOAT + .set hardfloat cfc1 \tmp, fcr31 s.d $f0, THREAD_FPR0(\thread) s.d $f2, THREAD_FPR2(\thread) @@ -39,7 +39,7 @@ .macro fpu_restore_single thread tmp=t0 .set push - SET_HARDFLOAT + .set hardfloat lw \tmp, THREAD_FCR31(\thread) l.d $f0, THREAD_FPR0(\thread) l.d $f2, THREAD_FPR2(\thread) diff --git a/arch/mips/include/asm/asmmacro.h b/arch/mips/include/asm/asmmacro.h index ca83ada7015f..1c4438f3f2ab 100644 --- a/arch/mips/include/asm/asmmacro.h +++ b/arch/mips/include/asm/asmmacro.h @@ -83,7 +83,7 @@ .macro fpu_save_16even thread tmp=t0 .set push - SET_HARDFLOAT + .set hardfloat cfc1 \tmp, fcr31 sdc1 $f0, THREAD_FPR0(\thread) sdc1 $f2, THREAD_FPR2(\thread) @@ -109,7 +109,7 @@ .set push .set mips64r2 .set fp=64 - SET_HARDFLOAT + .set hardfloat sdc1 $f1, THREAD_FPR1(\thread) sdc1 $f3, THREAD_FPR3(\thread) sdc1 $f5, THREAD_FPR5(\thread) @@ -142,7 +142,7 @@ .macro fpu_restore_16even thread tmp=t0 .set push - SET_HARDFLOAT + .set hardfloat lw \tmp, THREAD_FCR31(\thread) ldc1 $f0, THREAD_FPR0(\thread) ldc1 $f2, THREAD_FPR2(\thread) @@ -168,7 +168,7 @@ .set push .set mips64r2 .set fp=64 - SET_HARDFLOAT + .set hardfloat ldc1 $f1, THREAD_FPR1(\thread) ldc1 $f3, THREAD_FPR3(\thread) ldc1 $f5, THREAD_FPR5(\thread) @@ -373,7 +373,7 @@ .macro _cfcmsa rd, cs .set push .set noat - SET_HARDFLOAT + .set hardfloat insn_if_mips 0x787e0059 | (\cs << 11) insn32_if_mm 0x587e0056 | (\cs << 11) move \rd, $1 @@ -383,7 +383,7 @@ .macro _ctcmsa cd, rs .set push .set noat - SET_HARDFLOAT + .set hardfloat move $1, \rs insn_if_mips 0x783e0819 | (\cd << 6) insn32_if_mm 0x583e0816 | (\cd << 6) @@ -393,7 +393,7 @@ .macro ld_b wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000820 | (\wd << 6) insn32_if_mm 0x58000807 | (\wd << 6) @@ -403,7 +403,7 @@ .macro ld_h wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000821 | (\wd << 6) insn32_if_mm 0x58000817 | (\wd << 6) @@ -413,7 +413,7 @@
.macro ld_w wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000822 | (\wd << 6) insn32_if_mm 0x58000827 | (\wd << 6) @@ -423,7 +423,7 @@ .macro ld_d wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000823 | (\wd << 6) insn32_if_mm 0x58000837 | (\wd << 6) @@ -433,7 +433,7 @@ .macro st_b wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000824 | (\wd << 6) insn32_if_mm 0x5800080f | (\wd << 6) @@ -443,7 +443,7 @@ .macro st_h wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000825 | (\wd << 6) insn32_if_mm 0x5800081f | (\wd << 6) @@ -453,7 +453,7 @@ .macro st_w wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000826 | (\wd << 6) insn32_if_mm 0x5800082f | (\wd << 6) @@ -463,7 +463,7 @@ .macro st_d wd, off, base .set push .set noat - SET_HARDFLOAT + .set hardfloat PTR_ADDU $1, \base, \off insn_if_mips 0x78000827 | (\wd << 6) insn32_if_mm 0x5800083f | (\wd << 6) @@ -473,7 +473,7 @@ .macro copy_s_w ws, n .set push .set noat - SET_HARDFLOAT + .set hardfloat insn_if_mips 0x78b00059 | (\n << 16) | (\ws << 11) insn32_if_mm 0x58b00056 | (\n << 16) | (\ws << 11) .set pop @@ -482,7 +482,7 @@ .macro copy_s_d ws, n .set push .set noat - SET_HARDFLOAT + .set hardfloat insn_if_mips 0x78b80059 | (\n << 16) | (\ws << 11) insn32_if_mm 0x58b80056 | (\n << 16) | (\ws << 11) .set pop @@ -491,7 +491,7 @@ .macro insert_w wd, n .set push .set noat - SET_HARDFLOAT + .set hardfloat insn_if_mips 0x79300819 | (\n << 16) | (\wd << 6) insn32_if_mm 0x59300816 | (\n << 16) | (\wd << 6) .set pop @@ -500,7 +500,7 @@ .macro insert_d wd, n .set push .set noat - SET_HARDFLOAT + .set hardfloat insn_if_mips 0x79380819 | (\n << 16) | (\wd << 6) insn32_if_mm 0x59380816 | (\n << 16) | (\wd << 6) .set pop @@ -553,7 +553,7 @@ st_d 29, THREAD_FPR29 - FPR_BASE_OFFS, FPR_BASE st_d 30, THREAD_FPR30 - FPR_BASE_OFFS, FPR_BASE st_d 31, THREAD_FPR31 - FPR_BASE_OFFS, FPR_BASE - SET_HARDFLOAT + .set hardfloat _cfcmsa $1, MSA_CSR sw $1, THREAD_MSA_CSR(\thread) .set pop @@ -562,7 +562,7 @@ .macro msa_restore_all thread .set push .set noat - SET_HARDFLOAT + .set hardfloat lw $1, THREAD_MSA_CSR(\thread) _ctcmsa MSA_CSR, $1 #ifdef TOOLCHAIN_SUPPORTS_MSA @@ -618,7 +618,7 @@ .macro msa_init_all_upper .set push .set noat - SET_HARDFLOAT + .set hardfloat not $1, zero msa_init_upper 0 msa_init_upper 1 diff --git a/arch/mips/include/asm/fpregdef.h b/arch/mips/include/asm/fpregdef.h index f184ba088532..429481f9028d 100644 --- a/arch/mips/include/asm/fpregdef.h +++ b/arch/mips/include/asm/fpregdef.h @@ -14,20 +14,6 @@ #include <asm/sgidefs.h> -/* - * starting with binutils 2.24.51.20140729, MIPS binutils warn about mixing - * hardfloat and softfloat object files. The kernel build uses soft-float by - * default, so we also need to pass -msoft-float along to GAS if it supports it. - * But this in turn causes assembler errors in files which access hardfloat - * registers. We detect if GAS supports "-msoft-float" in the Makefile and - * explicitly put ".set hardfloat" where floating point registers are touched. 
- */ -#ifdef GAS_HAS_SET_HARDFLOAT -#define SET_HARDFLOAT .set hardfloat -#else -#define SET_HARDFLOAT -#endif - #if _MIPS_SIM == _MIPS_SIM_ABI32 /* diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5cedb28e8a40..2803c9c21ef9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -758,7 +758,7 @@ struct kvm_mips_callbacks { void (*vcpu_reenter)(struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; -int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); +int kvm_mips_emulation_init(void); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); @@ -888,7 +888,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h b/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h index 30f4114ab872..3c401f11655e 100644 --- a/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h +++ b/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h @@ -61,6 +61,7 @@ enum bcm47xx_board { BCM47XX_BOARD_LINKSYS_E1000V21, BCM47XX_BOARD_LINKSYS_E1200V2, BCM47XX_BOARD_LINKSYS_E2000V1, + BCM47XX_BOARD_LINKSYS_E2500V3, BCM47XX_BOARD_LINKSYS_E3000V1, BCM47XX_BOARD_LINKSYS_E3200V1, BCM47XX_BOARD_LINKSYS_E4200V1, diff --git a/arch/mips/include/asm/mach-rc32434/pci.h b/arch/mips/include/asm/mach-rc32434/pci.h index 9a6eefd12757..3eb767c8a4ee 100644 --- a/arch/mips/include/asm/mach-rc32434/pci.h +++ b/arch/mips/include/asm/mach-rc32434/pci.h @@ -374,7 +374,7 @@ struct pci_msu { PCI_CFG04_STAT_SSE | \ PCI_CFG04_STAT_PE) -#define KORINA_CNFG1 ((KORINA_STAT<<16)|KORINA_CMD) +#define KORINA_CNFG1 (KORINA_STAT | KORINA_CMD) #define KORINA_REVID 0 #define KORINA_CLASS_CODE 0 diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index 99eeafe6dcab..2d53704d9f24 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -2367,7 +2367,7 @@ do { \ /* * Macros to access the floating point coprocessor control registers */ -#define _read_32bit_cp1_register(source, gas_hardfloat) \ +#define read_32bit_cp1_register(source) \ ({ \ unsigned int __res; \ \ @@ -2377,36 +2377,24 @@ do { \ " # gas fails to assemble cfc1 for some archs, \n" \ " # like Octeon. 
\n" \ " .set mips1 \n" \ - " "STR(gas_hardfloat)" \n" \ + " .set hardfloat \n" \ " cfc1 %0,"STR(source)" \n" \ " .set pop \n" \ : "=r" (__res)); \ __res; \ }) -#define _write_32bit_cp1_register(dest, val, gas_hardfloat) \ +#define write_32bit_cp1_register(dest, val) \ do { \ __asm__ __volatile__( \ " .set push \n" \ " .set reorder \n" \ - " "STR(gas_hardfloat)" \n" \ + " .set hardfloat \n" \ " ctc1 %0,"STR(dest)" \n" \ " .set pop \n" \ : : "r" (val)); \ } while (0) -#ifdef GAS_HAS_SET_HARDFLOAT -#define read_32bit_cp1_register(source) \ - _read_32bit_cp1_register(source, .set hardfloat) -#define write_32bit_cp1_register(dest, val) \ - _write_32bit_cp1_register(dest, val, .set hardfloat) -#else -#define read_32bit_cp1_register(source) \ - _read_32bit_cp1_register(source, ) -#define write_32bit_cp1_register(dest, val) \ - _write_32bit_cp1_register(dest, val, ) -#endif - #ifdef TOOLCHAIN_SUPPORTS_DSP #define rddsp(mask) \ ({ \ diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 25fa651c937d..ebdf4d910af2 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -38,7 +38,7 @@ static inline bool mips_syscall_is_indirect(struct task_struct *task, static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - return current_thread_info()->syscall; + return task_thread_info(task)->syscall; } static inline void mips_syscall_update_nr(struct task_struct *task, diff --git a/arch/mips/include/asm/vpe.h b/arch/mips/include/asm/vpe.h index baa949a744cb..ef7e07829607 100644 --- a/arch/mips/include/asm/vpe.h +++ b/arch/mips/include/asm/vpe.h @@ -102,7 +102,6 @@ struct vpe_control { struct list_head tc_list; /* Thread contexts */ }; -extern unsigned long physical_memsize; extern struct vpe_control vpecontrol; extern const struct file_operations vpe_fops; diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index 3425df6019c0..b6de8e88c1bd 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -480,7 +480,7 @@ NESTED(nmi_handler, PT_SIZE, sp) .set push /* gas fails to assemble cfc1 for some archs (octeon).*/ \ .set mips1 - SET_HARDFLOAT + .set hardfloat cfc1 a1, fcr31 .set pop .endm diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S index 2748c55820c2..6c745aa9e825 100644 --- a/arch/mips/kernel/r2300_fpu.S +++ b/arch/mips/kernel/r2300_fpu.S @@ -64,7 +64,7 @@ LEAF(_restore_fp) */ LEAF(_save_fp_context) .set push - SET_HARDFLOAT + .set hardfloat li v0, 0 # assume success cfc1 t1, fcr31 EX2(s.d $f0, 0(a0)) @@ -98,7 +98,7 @@ LEAF(_save_fp_context) */ LEAF(_restore_fp_context) .set push - SET_HARDFLOAT + .set hardfloat li v0, 0 # assume success EX(lw t0, (a1)) EX2(l.d $f0, 0(a0)) diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S index 2e687c60bc4f..4e8c98517d9d 100644 --- a/arch/mips/kernel/r4k_fpu.S +++ b/arch/mips/kernel/r4k_fpu.S @@ -26,7 +26,7 @@ .macro EX insn, reg, src .set push - SET_HARDFLOAT + .set hardfloat .set nomacro .ex\@: \insn \reg, \src .set pop @@ -98,14 +98,14 @@ LEAF(_init_msa_upper) */ LEAF(_save_fp_context) .set push - SET_HARDFLOAT + .set hardfloat cfc1 t1, fcr31 .set pop #if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPSR2) || \ defined(CONFIG_CPU_MIPSR5) || defined(CONFIG_CPU_MIPSR6) .set push - SET_HARDFLOAT + .set hardfloat #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) .set mips32r2 .set fp=64 @@ -135,7 +135,7 @@ LEAF(_save_fp_context) #endif .set push - SET_HARDFLOAT + .set hardfloat /* Store the 16 even double 
precision registers */ EX sdc1 $f0, 0(a0) EX sdc1 $f2, 16(a0) @@ -173,7 +173,7 @@ LEAF(_restore_fp_context) #if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPSR2) || \ defined(CONFIG_CPU_MIPSR5) || defined(CONFIG_CPU_MIPSR6) .set push - SET_HARDFLOAT + .set hardfloat #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) .set mips32r2 .set fp=64 @@ -201,7 +201,7 @@ LEAF(_restore_fp_context) 1: .set pop #endif .set push - SET_HARDFLOAT + .set hardfloat EX ldc1 $f0, 0(a0) EX ldc1 $f2, 16(a0) EX ldc1 $f4, 32(a0) diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index bcd6a944b839..f2df0cae1b4d 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -424,9 +424,11 @@ static void cps_shutdown_this_cpu(enum cpu_death death) wmb(); } } else { - pr_debug("Gating power to core %d\n", core); - /* Power down the core */ - cps_pm_enter_state(CPS_PM_POWER_GATED); + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + pr_debug("Gating power to core %d\n", core); + /* Power down the core */ + cps_pm_enter_state(CPS_PM_POWER_GATED); + } } } diff --git a/arch/mips/kernel/vpe-mt.c b/arch/mips/kernel/vpe-mt.c index 84a82b551ec3..223d6274f2e5 100644 --- a/arch/mips/kernel/vpe-mt.c +++ b/arch/mips/kernel/vpe-mt.c @@ -92,12 +92,11 @@ int vpe_run(struct vpe *v) write_tc_c0_tchalt(read_tc_c0_tchalt() & ~TCHALT_H); /* - * The sde-kit passes 'memsize' to __start in $a3, so set something - * here... Or set $a3 to zero and define DFLT_STACK_SIZE and - * DFLT_HEAP_SIZE when you compile your program + * We don't pass the memsize here, so VPE programs need to be + * compiled with DFLT_STACK_SIZE and DFLT_HEAP_SIZE defined. */ + mttgpr(7, 0); mttgpr(6, v->ntcs); - mttgpr(7, physical_memsize); /* set up VPE1 */ /* diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 91d197bee9c0..29e51649203b 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select MMU_NOTIFIER select SRCU select INTERVAL_TREE + select KVM_GENERIC_HARDWARE_ENABLING help Support for hosting Guest kernels. diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 21ff75bcdbc4..805aeea2166e 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -17,4 +17,4 @@ kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o kvm-y += vz.o obj-$(CONFIG_KVM) += kvm.o -obj-y += callback.o tlb.o +obj-y += tlb.o diff --git a/arch/mips/kvm/callback.c b/arch/mips/kvm/callback.c deleted file mode 100644 index d88aa2173fb0..000000000000 --- a/arch/mips/kvm/callback.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Yann Le Du <ledu@kymasys.com> - */ - -#include <linux/export.h> -#include <linux/kvm_host.h> - -struct kvm_mips_callbacks *kvm_mips_callbacks; -EXPORT_SYMBOL_GPL(kvm_mips_callbacks); diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S index 16f17c6390dd..eb2e8cc3532f 100644 --- a/arch/mips/kvm/fpu.S +++ b/arch/mips/kvm/fpu.S @@ -22,7 +22,7 @@ LEAF(__kvm_save_fpu) .set push - SET_HARDFLOAT + .set hardfloat .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? @@ -66,7 +66,7 @@ LEAF(__kvm_save_fpu) LEAF(__kvm_restore_fpu) .set push - SET_HARDFLOAT + .set hardfloat .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? 
@@ -110,7 +110,7 @@ LEAF(__kvm_restore_fpu) LEAF(__kvm_restore_fcsr) .set push - SET_HARDFLOAT + .set hardfloat lw t0, VCPU_FCR31(a0) /* * The ctc1 must stay at this offset in __kvm_restore_fcsr. diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a25e0b73ee70..36c8991b5d39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,16 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) @@ -1015,21 +1005,6 @@ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return r; } -int kvm_arch_init(void *opaque) -{ - if (kvm_mips_callbacks) { - kvm_err("kvm: module already exists\n"); - return -EEXIST; - } - - return kvm_mips_emulation_init(&kvm_mips_callbacks); -} - -void kvm_arch_exit(void) -{ - kvm_mips_callbacks = NULL; -} - int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -1646,16 +1621,21 @@ static int __init kvm_mips_init(void) if (ret) return ret; - ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - + ret = kvm_mips_emulation_init(); if (ret) return ret; + if (boot_cpu_type() == CPU_LOONGSON64) kvm_priority_to_irq = kvm_loongson3_priority_to_irq; register_die_notifier(&kvm_mips_csr_die_notifier); + ret = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (ret) { + unregister_die_notifier(&kvm_mips_csr_die_notifier); + return ret; + } return 0; } diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index c706f5890a05..dafab003ea0d 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3304,7 +3304,10 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .vcpu_reenter = kvm_vz_vcpu_reenter, }; -int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) +/* FIXME: Get rid of the callbacks now that trap-and-emulate is gone. */ +struct kvm_mips_callbacks *kvm_mips_callbacks = &kvm_vz_callbacks; + +int kvm_mips_emulation_init(void) { if (!cpu_has_vz) return -ENODEV; @@ -3318,7 +3321,5 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) return -ENODEV; pr_info("Starting KVM with MIPS VZ extensions\n"); - - *install_callbacks = &kvm_vz_callbacks; return 0; } diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c index be4829cc7a3a..a3cf29365858 100644 --- a/arch/mips/lantiq/prom.c +++ b/arch/mips/lantiq/prom.c @@ -23,12 +23,6 @@ DEFINE_SPINLOCK(ebu_lock); EXPORT_SYMBOL_GPL(ebu_lock); /* - * This is needed by the VPE loader code, just set it to 0 and assume - * that the firmware hardcodes this value to something useful. 
- */ -unsigned long physical_memsize = 0L; - -/* * this struct is filled by the soc specific detection code and holds * information about the specific soc type, revision and name */ diff --git a/arch/mips/lantiq/xway/dcdc.c b/arch/mips/lantiq/xway/dcdc.c index 4960bee0a99d..96199966a350 100644 --- a/arch/mips/lantiq/xway/dcdc.c +++ b/arch/mips/lantiq/xway/dcdc.c @@ -22,10 +22,7 @@ static void __iomem *dcdc_membase; static int dcdc_probe(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - dcdc_membase = devm_ioremap_resource(&pdev->dev, res); + dcdc_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(dcdc_membase)) return PTR_ERR(dcdc_membase); diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c index f8eedeb15f18..934ac72937e5 100644 --- a/arch/mips/lantiq/xway/dma.c +++ b/arch/mips/lantiq/xway/dma.c @@ -239,12 +239,10 @@ static int ltq_dma_init(struct platform_device *pdev) { struct clk *clk; - struct resource *res; unsigned int id, nchannels; int i; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ltq_dma_membase = devm_ioremap_resource(&pdev->dev, res); + ltq_dma_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(ltq_dma_membase)) panic("Failed to remap dma resource"); diff --git a/arch/mips/lantiq/xway/gptu.c b/arch/mips/lantiq/xway/gptu.c index 200fe9ff641d..a492b1eb1925 100644 --- a/arch/mips/lantiq/xway/gptu.c +++ b/arch/mips/lantiq/xway/gptu.c @@ -136,17 +136,14 @@ static inline void clkdev_add_gptu(struct device *dev, const char *con, static int gptu_probe(struct platform_device *pdev) { struct clk *clk; - struct resource *res; if (of_irq_to_resource_table(pdev->dev.of_node, irqres, 6) != 6) { dev_err(&pdev->dev, "Failed to get IRQ list\n"); return -EINVAL; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - /* remap gptu register range */ - gptu_membase = devm_ioremap_resource(&pdev->dev, res); + gptu_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(gptu_membase)) return PTR_ERR(gptu_membase); diff --git a/arch/mips/loongson2ef/Platform b/arch/mips/loongson2ef/Platform index eebabf9df6ac..c6f7a4b95997 100644 --- a/arch/mips/loongson2ef/Platform +++ b/arch/mips/loongson2ef/Platform @@ -25,7 +25,7 @@ cflags-$(CONFIG_CPU_LOONGSON2F) += -march=loongson2f # binutils does not merge support for the flag then we can revisit & remove # this later - for now it ensures vendor toolchains don't cause problems. 
# -cflags-$(CONFIG_CPU_LOONGSON2EF) += $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,) +cflags-$(CONFIG_CPU_LOONGSON2EF) += $(call cc-option,-Wa$(comma)-mno-fix-loongson3-llsc,) # Enable the workarounds for Loongson2f ifdef CONFIG_CPU_LOONGSON2F_WORKAROUNDS diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 8d16cd021f60..d967e4c0cf24 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -204,17 +204,13 @@ static int ltq_pci_startup(struct platform_device *pdev) static int ltq_pci_probe(struct platform_device *pdev) { - struct resource *res_cfg, *res_bridge; - pci_clear_flags(PCI_PROBE_ONLY); - res_bridge = platform_get_resource(pdev, IORESOURCE_MEM, 1); - ltq_pci_membase = devm_ioremap_resource(&pdev->dev, res_bridge); + ltq_pci_membase = devm_platform_get_and_ioremap_resource(pdev, 1, NULL); if (IS_ERR(ltq_pci_membase)) return PTR_ERR(ltq_pci_membase); - res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ltq_pci_mapped_cfg = devm_ioremap_resource(&pdev->dev, res_cfg); + ltq_pci_mapped_cfg = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(ltq_pci_mapped_cfg)) return PTR_ERR(ltq_pci_mapped_cfg); diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c index e032932348d6..2700d75d41c5 100644 --- a/arch/mips/pci/pci-mt7620.c +++ b/arch/mips/pci/pci-mt7620.c @@ -282,21 +282,17 @@ static int mt7628_pci_hw_init(struct platform_device *pdev) static int mt7620_pci_probe(struct platform_device *pdev) { - struct resource *bridge_res = platform_get_resource(pdev, - IORESOURCE_MEM, 0); - struct resource *pcie_res = platform_get_resource(pdev, - IORESOURCE_MEM, 1); u32 val = 0; rstpcie0 = devm_reset_control_get_exclusive(&pdev->dev, "pcie0"); if (IS_ERR(rstpcie0)) return PTR_ERR(rstpcie0); - bridge_base = devm_ioremap_resource(&pdev->dev, bridge_res); + bridge_base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(bridge_base)) return PTR_ERR(bridge_base); - pcie_base = devm_ioremap_resource(&pdev->dev, pcie_res); + pcie_base = devm_platform_get_and_ioremap_resource(pdev, 1, NULL); if (IS_ERR(pcie_base)) return PTR_ERR(pcie_base); diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig index f9fe15630abb..06031796c87b 100644 --- a/arch/mips/ralink/Kconfig +++ b/arch/mips/ralink/Kconfig @@ -54,10 +54,11 @@ choice select HAVE_PCI select PCI_DRIVERS_GENERIC select SOC_BUS + select PINCTRL_MT7621 help - The MT7621 system-on-a-chip includes an 880 MHz MIPS1004Kc dual-core CPU, - a 5-port 10/100/1000 switch/PHY and one RGMII. + The MT7621 system-on-a-chip includes an 880 MHz MIPS1004Kc + dual-core CPU, a 5-port 10/100/1000 switch/PHY and one RGMII. 
endchoice choice diff --git a/arch/mips/ralink/timer.c b/arch/mips/ralink/timer.c index 652424d8ed51..fc503679a93d 100644 --- a/arch/mips/ralink/timer.c +++ b/arch/mips/ralink/timer.c @@ -95,7 +95,6 @@ static int rt_timer_enable(struct rt_timer *rt) static int rt_timer_probe(struct platform_device *pdev) { - struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); struct rt_timer *rt; struct clk *clk; @@ -109,7 +108,7 @@ static int rt_timer_probe(struct platform_device *pdev) if (rt->irq < 0) return rt->irq; - rt->membase = devm_ioremap_resource(&pdev->dev, res); + rt->membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(rt->membase)) return PTR_ERR(rt->membase); diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig index a665f6108cb5..70140248da72 100644 --- a/arch/mips/vdso/Kconfig +++ b/arch/mips/vdso/Kconfig @@ -1,18 +1,6 @@ -# For the pre-R6 code in arch/mips/vdso/vdso.h for locating -# the base address of VDSO, the linker will emit a R_MIPS_PC32 -# relocation in binutils > 2.25 but it will fail with older versions -# because that relocation is not supported for that symbol. As a result -# of which we are forced to disable the VDSO symbols when building -# with < 2.25 binutils on pre-R6 kernels. For more references on why we -# can't use other methods to get the base address of VDSO please refer to -# the comments on that file. -# # GCC (at least up to version 9.2) appears to emit function calls that make use # of the GOT when targeting microMIPS, which we can't use in the VDSO due to # the lack of relocations. As such, we disable the VDSO for microMIPS builds. -config MIPS_LD_CAN_LINK_VDSO - def_bool LD_VERSION >= 22500 || LD_IS_LLD - config MIPS_DISABLE_VDSO - def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO) + def_bool CPU_MICROMIPS diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 1f7d5c6c10b0..18af9474ed0e 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -52,9 +52,6 @@ endif CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) ifdef CONFIG_MIPS_DISABLE_VDSO - ifndef CONFIG_MIPS_LD_CAN_LINK_VDSO - $(warning MIPS VDSO requires binutils >= 2.25) - endif obj-vdso-y := $(filter-out vgettimeofday.o, $(obj-vdso-y)) endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7a5f8dbfbdd0..2c9cdf1d8761 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_ARCH_KASAN if PPC_RADIX_MMU select HAVE_ARCH_KASAN if PPC_BOOK3E_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN + select HAVE_ARCH_KCSAN if PPC_BOOK3S_64 select HAVE_ARCH_KFENCE if ARCH_SUPPORTS_DEBUG_PAGEALLOC select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_KGDB @@ -206,7 +207,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ASM_MODVERSIONS - select HAVE_CONTEXT_TRACKING_USER if PPC64 + select HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW @@ -256,6 +257,7 @@ config PPC select HAVE_STATIC_CALL if PPC32 select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_VIRT_CPU_ACCOUNTING_GEN select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE select IOMMU_HELPER if PPC64 select IRQ_DOMAIN @@ -387,10 +389,22 @@ config PPC_DCR depends on PPC_DCR_NATIVE || PPC_DCR_MMIO default y +config PPC_PCI_OF_BUS_MAP + bool "Use pci_to_OF_bus_map (deprecated)" + depends on PPC32 + depends on PPC_PMAC || PPC_CHRP + help + This option uses 
pci_to_OF_bus_map to map OF nodes to PCI devices, which + restricts the system to only having 256 PCI buses. On CHRP it also causes + the "pci-OF-bus-map" property to be created in the device tree. + + If unsure, say "N". + config PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT depends on PPC32 - depends on !PPC_PMAC && !PPC_CHRP + depends on !PPC_PCI_OF_BUS_MAP bool "Assign PCI bus numbers from zero individually for each PCI domain" + default y help By default on PPC32 were PCI bus numbers unique across all PCI domains. So system could have only 256 PCI buses independently of available @@ -1028,6 +1042,7 @@ config PPC_SECURE_BOOT depends on PPC_POWERNV || PPC_PSERIES depends on IMA_ARCH_POLICY imply IMA_SECURE_AND_OR_TRUSTED_BOOT + select PSERIES_PLPKS if PPC_PSERIES help Systems with firmware secure boot enabled need to define security policies to extend secure boot to the OS. This config allows a user diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 533457466ce2..e91d7e91347d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -90,7 +90,7 @@ aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian ifeq ($(HAS_BIARCH),y) KBUILD_CFLAGS += -m$(BITS) -KBUILD_AFLAGS += -m$(BITS) -Wl,-a$(BITS) +KBUILD_AFLAGS += -m$(BITS) KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) endif @@ -146,19 +146,6 @@ CFLAGS-$(CONFIG_PPC32) += $(call cc-option, $(MULTIPLEWORD)) CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata) -ifdef CONFIG_PPC_BOOK3S_64 -ifdef CONFIG_CPU_LITTLE_ENDIAN -CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=power8 -else -CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=power4 -endif -CFLAGS-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=power10, \ - $(call cc-option,-mtune=power9, \ - $(call cc-option,-mtune=power8))) -else ifdef CONFIG_PPC_BOOK3E_64 -CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64 -endif - ifdef CONFIG_FUNCTION_TRACER CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL @@ -166,11 +153,12 @@ CC_FLAGS_FTRACE += -mprofile-kernel endif endif -CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU)) -AFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU)) +CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += -mcpu=$(CONFIG_TARGET_CPU) +AFLAGS-$(CONFIG_TARGET_CPU_BOOL) += -mcpu=$(CONFIG_TARGET_CPU) -CFLAGS-$(CONFIG_E5500_CPU) += $(call cc-option,-mcpu=e500mc64,-mcpu=powerpc64) -CFLAGS-$(CONFIG_E6500_CPU) += $(call cc-option,-mcpu=e6500,$(E5500_CPU)) +CFLAGS-$(CONFIG_POWERPC64_CPU) += $(call cc-option,-mtune=power10, \ + $(call cc-option,-mtune=power9, \ + $(call cc-option,-mtune=power8))) asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) @@ -213,10 +201,7 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # often slow when they are implemented at all KBUILD_CFLAGS += $(call cc-option,-mno-string) -cpu-as-$(CONFIG_40x) += -Wa,-m405 -cpu-as-$(CONFIG_44x) += -Wa,-m440 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) -cpu-as-$(CONFIG_PPC_E500) += -Wa,-me500 # When using '-many -mpower4' gas will first try and find a matching power4 # mnemonic and failing that it will allow any valid mnemonic that GAS knows @@ -224,7 +209,6 @@ cpu-as-$(CONFIG_PPC_E500) += -Wa,-me500 # LLVM IAS doesn't understand either flag: https://github.com/ClangBuiltLinux/linux/issues/675 # but LLVM IAS only supports ISA >= 2.06 for Book3S 64 anyway... 
cpu-as-$(CONFIG_PPC_BOOK3S_64) += $(call as-option,-Wa$(comma)-mpower4) $(call as-option,-Wa$(comma)-many) -cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc) KBUILD_AFLAGS += $(cpu-as-y) KBUILD_CFLAGS += $(cpu-as-y) diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index a6c77f4d32b2..1f860b3c9bec 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -9,7 +9,7 @@ PHONY := __archpost __archpost: -include include/config/auto.conf -include scripts/Kbuild.include +include $(srctree)/scripts/Kbuild.include quiet_cmd_head_check = CHKHEAD $@ cmd_head_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/head_check.sh "$(NM)" "$@" diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index d32d95aea5d6..295f76df13b5 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -39,13 +39,19 @@ BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ $(LINUXINCLUDE) ifdef CONFIG_PPC64_BOOT_WRAPPER -ifdef CONFIG_CPU_LITTLE_ENDIAN -BOOTCFLAGS += -m64 -mcpu=powerpc64le +BOOTCFLAGS += -m64 else -BOOTCFLAGS += -m64 -mcpu=powerpc64 +BOOTCFLAGS += -m32 endif + +ifdef CONFIG_TARGET_CPU_BOOL +BOOTCFLAGS += -mcpu=$(CONFIG_TARGET_CPU) +else ifdef CONFIG_PPC64_BOOT_WRAPPER +ifdef CONFIG_CPU_LITTLE_ENDIAN +BOOTCFLAGS += -mcpu=powerpc64le else -BOOTCFLAGS += -m32 -mcpu=powerpc +BOOTCFLAGS += -mcpu=powerpc64 +endif endif BOOTCFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include) diff --git a/arch/powerpc/boot/dts/turris1x.dts b/arch/powerpc/boot/dts/turris1x.dts index e9cda34a140e..c9b619f6ed5c 100644 --- a/arch/powerpc/boot/dts/turris1x.dts +++ b/arch/powerpc/boot/dts/turris1x.dts @@ -367,11 +367,34 @@ }; reboot@d { + /* + * The CPLD firmware which manages the system reset and + * watchdog registers has bugs. It does not auto-clear + * the system reset register after a change, and the + * watchdog ignores the reset line on the immediately + * succeeding reset cycle that the watchdog itself + * triggered. These bugs have to be worked around in + * the U-Boot bootloader. So use system reset via + * syscon only as a last resort, because older U-Boot + * versions do not have a workaround for the watchdog. + * + * The reset method via rstcr's global-utilities + * (the preferred one) has priority level 128, the + * watchdog has priority level 0 and the default + * syscon-reboot priority level is 192. + * + * So define syscon-reboot with a custom priority level + * of 64 (between rstcr and the watchdog), because rstcr + * should stay the default preferred reset method, and + * reset via the watchdog is more broken than system + * reset via syscon.
+ */ compatible = "syscon-reboot"; reg = <0x0d 0x01>; offset = <0x0d>; mask = <0x01>; value = <0x01>; + priority = <64>; }; led-controller@13 { diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index 0a1b42c4f26a..52a8c5450ecb 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -1,8 +1,3 @@ -CONFIG_PPC64=y -CONFIG_CELL_CPU=y -CONFIG_ALTIVEC=y -CONFIG_SMP=y -CONFIG_NR_CPUS=2 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_HIGH_RES_TIMERS=y @@ -10,11 +5,12 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_PERF_EVENTS is not set -# CONFIG_COMPAT_BRK is not set -CONFIG_SLAB=y CONFIG_PROFILING=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y +CONFIG_PPC64=y +CONFIG_CELL_CPU=y +CONFIG_ALTIVEC=y +CONFIG_SMP=y +CONFIG_NR_CPUS=2 # CONFIG_PPC_POWERNV is not set # CONFIG_PPC_PSERIES is not set # CONFIG_PPC_PMAC is not set @@ -27,17 +23,20 @@ CONFIG_PS3_FLASH=y CONFIG_PS3_VRAM=m CONFIG_PS3_LPM=m # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -CONFIG_BINFMT_MISC=y CONFIG_KEXEC=y CONFIG_PPC_4K_PAGES=y -# CONFIG_SPARSEMEM_VMEMMAP is not set -# CONFIG_COMPACTION is not set CONFIG_SCHED_SMT=y CONFIG_PM=y CONFIG_PM_DEBUG=y # CONFIG_SECCOMP is not set -# CONFIG_PCI is not set +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_SLAB=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SPARSEMEM_VMEMMAP is not set +# CONFIG_COMPACTION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -87,7 +86,6 @@ CONFIG_USB_USBNET=m # CONFIG_USB_NET_NET1080 is not set # CONFIG_USB_NET_CDC_SUBSET is not set # CONFIG_USB_NET_ZAURUS is not set -CONFIG_INPUT_FF_MEMLESS=m CONFIG_INPUT_JOYDEV=m CONFIG_INPUT_EVDEV=m # CONFIG_INPUT_KEYBOARD is not set @@ -110,13 +108,10 @@ CONFIG_SND=m # CONFIG_SND_DRIVERS is not set CONFIG_SND_USB_AUDIO=m CONFIG_HIDRAW=y -CONFIG_HID_APPLE=m CONFIG_HID_BELKIN=m CONFIG_HID_CHERRY=m CONFIG_HID_EZKEY=m CONFIG_HID_TWINHAN=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m CONFIG_HID_MICROSOFT=m CONFIG_HID_SUNPLUS=m CONFIG_HID_SMARTJOYPLUS=m @@ -151,8 +146,12 @@ CONFIG_CIFS=m CONFIG_NLS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_LZO=m CONFIG_CRC_CCITT=m CONFIG_CRC_T10DIF=y +CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_MEMORY_INIT=y @@ -163,7 +162,3 @@ CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_LIST=y CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_FTRACE is not set -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_LZO=m -CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S index a16a717c809c..b0f87f595b26 100644 --- a/arch/powerpc/crypto/crc32-vpmsum_core.S +++ b/arch/powerpc/crypto/crc32-vpmsum_core.S @@ -113,9 +113,7 @@ FUNC_START(CRC_FUNCTION_NAME) #endif #ifdef BYTESWAP_DATA - addis r3,r2,.byteswap_constant@toc@ha - addi r3,r3,.byteswap_constant@toc@l - + LOAD_REG_ADDR(r3, .byteswap_constant) lvx byteswap,0,r3 addi r3,r3,16 #endif @@ -150,8 +148,7 @@ FUNC_START(CRC_FUNCTION_NAME) addi r7,r7,-1 mtctr r7 - addis r3,r2,.constants@toc@ha - addi r3,r3,.constants@toc@l + LOAD_REG_ADDR(r3, .constants) /* Find the start of our constants */ add r3,r3,r8 @@ -506,8 +503,7 @@ FUNC_START(CRC_FUNCTION_NAME) .Lbarrett_reduction: /* Barrett constants */ - addis r3,r2,.barrett_constants@toc@ha - addi 
r3,r3,.barrett_constants@toc@l + LOAD_REG_ADDR(r3, .barrett_constants) lvx const1,0,r3 lvx const2,off16,r3 @@ -610,8 +606,7 @@ FUNC_START(CRC_FUNCTION_NAME) cmpdi r5,0 beq .Lzero - addis r3,r2,.short_constants@toc@ha - addi r3,r3,.short_constants@toc@l + LOAD_REG_ADDR(r3, .short_constants) /* Calculate where in the constant table we need to start */ subfic r6,r5,256 diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index e80b2c0e9315..b95b666f0374 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -35,9 +35,9 @@ * However, on CPUs that don't support lwsync, lwsync actually maps to a * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio. */ -#define mb() __asm__ __volatile__ ("sync" : : : "memory") -#define rmb() __asm__ __volatile__ ("sync" : : : "memory") -#define wmb() __asm__ __volatile__ ("sync" : : : "memory") +#define __mb() __asm__ __volatile__ ("sync" : : : "memory") +#define __rmb() __asm__ __volatile__ ("sync" : : : "memory") +#define __wmb() __asm__ __volatile__ ("sync" : : : "memory") /* The sub-arch has lwsync */ #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC) @@ -51,12 +51,12 @@ /* clang defines this macro for a builtin, which will not work with runtime patching */ #undef __lwsync #define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory") -#define dma_rmb() __lwsync() -#define dma_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory") +#define __dma_rmb() __lwsync() +#define __dma_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory") #define __smp_lwsync() __lwsync() -#define __smp_mb() mb() +#define __smp_mb() __mb() #define __smp_rmb() __lwsync() #define __smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory") diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 95fd7f9485d5..c099780385dd 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -335,6 +335,7 @@ #define H_RPT_INVALIDATE 0x448 #define H_SCM_FLUSH 0x44C #define H_GET_ENERGY_SCALE_INFO 0x450 +#define H_PKS_SIGNED_UPDATE 0x454 #define H_WATCHDOG 0x45C #define MAX_HCALL_OPCODE H_WATCHDOG diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index eb6d094083fd..317659fdeacf 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -36,15 +36,17 @@ #define PACA_IRQ_DEC 0x08 /* Or FIT */ #define PACA_IRQ_HMI 0x10 #define PACA_IRQ_PMI 0x20 +#define PACA_IRQ_REPLAYING 0x40 /* * Some soft-masked interrupts must be hard masked until they are replayed * (e.g., because the soft-masked handler does not clear the exception). + * Interrupt replay itself must remain hard masked too. 
*/ #ifdef CONFIG_PPC_BOOK3S -#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_PMI) +#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_PMI|PACA_IRQ_REPLAYING) #else -#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE) +#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_REPLAYING) #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 6d8492b6e2b8..a4196ab1d016 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -74,17 +74,18 @@ #include <asm/kprobes.h> #include <asm/runlatch.h> -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG /* * WARN/BUG is handled with a program interrupt so minimise checks here to * avoid recursion and maximise the chance of getting the first oops handled. */ #define INT_SOFT_MASK_BUG_ON(regs, cond) \ do { \ - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && \ - (user_mode(regs) || (TRAP(regs) != INTERRUPT_PROGRAM))) \ + if ((user_mode(regs) || (TRAP(regs) != INTERRUPT_PROGRAM))) \ BUG_ON(cond); \ } while (0) +#else +#define INT_SOFT_MASK_BUG_ON(regs, cond) #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -151,28 +152,8 @@ static inline void booke_restore_dbcr0(void) static inline void interrupt_enter_prepare(struct pt_regs *regs) { -#ifdef CONFIG_PPC32 - if (!arch_irq_disabled_regs(regs)) - trace_hardirqs_off(); - - if (user_mode(regs)) - kuap_lock(); - else - kuap_save_and_lock(regs); - - if (user_mode(regs)) - account_cpu_user_entry(); -#endif - #ifdef CONFIG_PPC64 - bool trace_enable = false; - - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS)) { - if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED) - trace_enable = true; - } else { - irq_soft_mask_set(IRQS_ALL_DISABLED); - } + irq_soft_mask_set(IRQS_ALL_DISABLED); /* * If the interrupt was taken with HARD_DIS clear, then enable MSR[EE]. 
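Aside: the INT_SOFT_MASK_BUG_ON() hunk above swaps an IS_ENABLED() test inside the macro body for a preprocessor guard around the whole definition, so kernels built without CONFIG_PPC_IRQ_SOFT_MASK_DEBUG get an empty macro instead of an always-false branch. A generic sketch of that pattern — the config option and macro names here are hypothetical:

#include <linux/bug.h>

#ifdef CONFIG_FOO_DEBUG
/* Debug builds compile the check in and can BUG() on it. */
#define FOO_DEBUG_BUG_ON(cond)	BUG_ON(cond)
#else
/* Non-debug builds expand the macro to nothing at all. */
#define FOO_DEBUG_BUG_ON(cond)
#endif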
@@ -188,9 +169,10 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) } else { __hard_RI_enable(); } + /* Enable MSR[RI] early, to support kernel SLB and hash faults */ +#endif - /* Do this when RI=1 because it can cause SLB faults */ - if (trace_enable) + if (!arch_irq_disabled_regs(regs)) trace_hardirqs_off(); if (user_mode(regs)) { @@ -215,7 +197,6 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) } INT_SOFT_MASK_BUG_ON(regs, !arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE)); -#endif booke_restore_dbcr0(); } diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 5c1516a5ba8f..deadd2149426 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -16,9 +16,6 @@ extern atomic_t ppc_n_lost_interrupts; -/* This number is used when no interrupt has been assigned */ -#define NO_IRQ (0) - /* Total number of virq in the platform */ #define NR_IRQS CONFIG_NR_IRQS diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..959f566a455c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -876,13 +876,10 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE -static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eae9619b6190..6bef23d6d0e3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -118,7 +118,6 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); -extern int kvmppc_core_check_processor_compat(void); extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 378b8d5836a7..459736d5e511 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_MACHDEP_H #ifdef __KERNEL__ +#include <linux/compiler.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/dma-mapping.h> @@ -220,11 +221,16 @@ extern struct machdep_calls *machine_id; EXPORT_SYMBOL(mach_##name); \ struct machdep_calls mach_##name __machine_desc = -#define machine_is(name) \ - ({ \ - extern struct machdep_calls mach_##name \ - __attribute__((weak)); \ - machine_id == &mach_##name; \ +static inline bool __machine_is(const struct machdep_calls *md) +{ + WARN_ON(!machine_id); // complain if used before probe_machine() + return machine_id == md; +} + +#define machine_is(name) \ + ({ \ + extern struct machdep_calls mach_##name __weak; \ + __machine_is(&mach_##name); \ }) static inline void log_error(char *buf, unsigned int err_type, int fatal) diff --git a/arch/powerpc/include/asm/paca.h 
b/arch/powerpc/include/asm/paca.h index 09f1790d0ae1..0ab3511a47d7 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -295,7 +295,6 @@ extern void free_unused_pacas(void); #else /* CONFIG_PPC64 */ -static inline void allocate_paca_ptrs(void) { } static inline void allocate_paca(int cpu) { } static inline void free_unused_pacas(void) { } diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h new file mode 100644 index 000000000000..f5fdbd8ae9db --- /dev/null +++ b/arch/powerpc/include/asm/papr-sysparm.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_POWERPC_PAPR_SYSPARM_H +#define _ASM_POWERPC_PAPR_SYSPARM_H + +typedef struct { + const u32 token; +} papr_sysparm_t; + +#define mk_papr_sysparm(x_) ((papr_sysparm_t){ .token = x_, }) + +/* + * Derived from the "Defined Parameters" table in PAPR 7.3.16 System + * Parameters Option. Where the spec says "characteristics", we use + * "attrs" in the symbolic names to keep them from getting too + * unwieldy. + */ +#define PAPR_SYSPARM_SHARED_PROC_LPAR_ATTRS mk_papr_sysparm(20) +#define PAPR_SYSPARM_PROC_MODULE_INFO mk_papr_sysparm(43) +#define PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS mk_papr_sysparm(44) +#define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50) +#define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55) + +enum { + PAPR_SYSPARM_MAX_INPUT = 1024, + PAPR_SYSPARM_MAX_OUTPUT = 4000, +}; + +struct papr_sysparm_buf { + __be16 len; + char val[PAPR_SYSPARM_MAX_OUTPUT]; +}; + +struct papr_sysparm_buf *papr_sysparm_buf_alloc(void); +void papr_sysparm_buf_free(struct papr_sysparm_buf *buf); +int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf); +int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf); + +#endif /* _ASM_POWERPC_PAPR_SYSPARM_H */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index e18c95f4e1d4..71c1d26f2400 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -176,8 +176,10 @@ extern int pci_device_from_OF_node(struct device_node *node, #endif #ifndef CONFIG_PPC64 -#ifdef CONFIG_PPC_CHRP +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP extern void pci_create_OF_bus_map(void); +#else +static inline void pci_create_OF_bus_map(void) {} #endif #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/include/asm/plpks.h b/arch/powerpc/include/asm/plpks.h new file mode 100644 index 000000000000..23b77027c916 --- /dev/null +++ b/arch/powerpc/include/asm/plpks.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 IBM Corporation + * Author: Nayna Jain <nayna@linux.ibm.com> + * + * Platform keystore for pseries LPAR(PLPKS). 
+ */ + +#ifndef _ASM_POWERPC_PLPKS_H +#define _ASM_POWERPC_PLPKS_H + +#ifdef CONFIG_PSERIES_PLPKS + +#include <linux/types.h> +#include <linux/list.h> + +// Object policy flags from supported_policies +#define PLPKS_OSSECBOOTAUDIT PPC_BIT32(1) // OS secure boot must be audit/enforce +#define PLPKS_OSSECBOOTENFORCE PPC_BIT32(2) // OS secure boot must be enforce +#define PLPKS_PWSET PPC_BIT32(3) // No access without password set +#define PLPKS_WORLDREADABLE PPC_BIT32(4) // Readable without authentication +#define PLPKS_IMMUTABLE PPC_BIT32(5) // Once written, object cannot be removed +#define PLPKS_TRANSIENT PPC_BIT32(6) // Object does not persist through reboot +#define PLPKS_SIGNEDUPDATE PPC_BIT32(7) // Object can only be modified by signed updates +#define PLPKS_HVPROVISIONED PPC_BIT32(28) // Hypervisor has provisioned this object + +// Signature algorithm flags from signed_update_algorithms +#define PLPKS_ALG_RSA2048 PPC_BIT(0) +#define PLPKS_ALG_RSA4096 PPC_BIT(1) + +// Object label OS metadata flags +#define PLPKS_VAR_LINUX 0x02 +#define PLPKS_VAR_COMMON 0x04 + +// Flags for which consumer owns an object +#define PLPKS_FW_OWNER 0x1 +#define PLPKS_BOOTLOADER_OWNER 0x2 +#define PLPKS_OS_OWNER 0x3 + +// Flags for label metadata fields +#define PLPKS_LABEL_VERSION 0 +#define PLPKS_MAX_LABEL_ATTR_SIZE 16 +#define PLPKS_MAX_NAME_SIZE 239 +#define PLPKS_MAX_DATA_SIZE 4000 + +// Timeouts for PLPKS operations +#define PLPKS_MAX_TIMEOUT 5000 // msec +#define PLPKS_FLUSH_SLEEP 10 // msec +#define PLPKS_FLUSH_SLEEP_RANGE 400 + +struct plpks_var { + char *component; + u8 *name; + u8 *data; + u32 policy; + u16 namelen; + u16 datalen; + u8 os; +}; + +struct plpks_var_name { + u8 *name; + u16 namelen; +}; + +struct plpks_var_name_list { + u32 varcount; + struct plpks_var_name varlist[]; +}; + +/** + * Updates the authenticated variable. It expects NULL as the component. + */ +int plpks_signed_update_var(struct plpks_var *var, u64 flags); + +/** + * Writes the specified var and its data to PKS. + * Any caller of the PKS driver should present a valid component type for + * their variable. + */ +int plpks_write_var(struct plpks_var var); + +/** + * Removes the specified var and its data from PKS. + */ +int plpks_remove_var(char *component, u8 varos, + struct plpks_var_name vname); + +/** + * Returns the data for the specified os variable. + * + * Caller must allocate a buffer in var->data with length in var->datalen. + * If no buffer is provided, var->datalen will be populated with the object's + * size. + */ +int plpks_read_os_var(struct plpks_var *var); + +/** + * Returns the data for the specified firmware variable. + * + * Caller must allocate a buffer in var->data with length in var->datalen. + * If no buffer is provided, var->datalen will be populated with the object's + * size. + */ +int plpks_read_fw_var(struct plpks_var *var); + +/** + * Returns the data for the specified bootloader variable. + * + * Caller must allocate a buffer in var->data with length in var->datalen. + * If no buffer is provided, var->datalen will be populated with the object's + * size. + */ +int plpks_read_bootloader_var(struct plpks_var *var); + +/** + * Returns whether PKS is available on this LPAR. + */ +bool plpks_is_available(void); + +/** + * Returns version of the Platform KeyStore. + */ +u8 plpks_get_version(void); + +/** + * Returns hypervisor storage overhead per object, not including the size of + * the object or label.
Only valid for config version >= 2 + */ +u16 plpks_get_objoverhead(void); + +/** + * Returns maximum password size. Must be >= 32 bytes + */ +u16 plpks_get_maxpwsize(void); + +/** + * Returns maximum object size supported by Platform KeyStore. + */ +u16 plpks_get_maxobjectsize(void); + +/** + * Returns maximum object label size supported by Platform KeyStore. + */ +u16 plpks_get_maxobjectlabelsize(void); + +/** + * Returns total size of the configured Platform KeyStore. + */ +u32 plpks_get_totalsize(void); + +/** + * Returns used space from the total size of the Platform KeyStore. + */ +u32 plpks_get_usedspace(void); + +/** + * Returns bitmask of policies supported by the hypervisor. + */ +u32 plpks_get_supportedpolicies(void); + +/** + * Returns maximum byte size of a single object supported by the hypervisor. + * Only valid for config version >= 3 + */ +u32 plpks_get_maxlargeobjectsize(void); + +/** + * Returns bitmask of signature algorithms supported for signed updates. + * Only valid for config version >= 3 + */ +u64 plpks_get_signedupdatealgorithms(void); + +/** + * Returns the length of the PLPKS password in bytes. + */ +u16 plpks_get_passwordlen(void); + +/** + * Called in early init to retrieve and clear the PLPKS password from the DT. + */ +void plpks_early_init_devtree(void); + +/** + * Populates the FDT with the PLPKS password to prepare for kexec. + */ +int plpks_populate_fdt(void *fdt); +#else // CONFIG_PSERIES_PLPKS +static inline bool plpks_is_available(void) { return false; } +static inline u16 plpks_get_passwordlen(void) { BUILD_BUG(); } +static inline void plpks_early_init_devtree(void) { } +static inline int plpks_populate_fdt(void *fdt) { BUILD_BUG(); } +#endif // CONFIG_PSERIES_PLPKS + +#endif // _ASM_POWERPC_PLPKS_H diff --git a/arch/powerpc/include/asm/rtas-types.h b/arch/powerpc/include/asm/rtas-types.h index 8df6235d64d1..f2ad4a96cbc5 100644 --- a/arch/powerpc/include/asm/rtas-types.h +++ b/arch/powerpc/include/asm/rtas-types.h @@ -18,8 +18,6 @@ struct rtas_t { unsigned long entry; /* physical address pointer */ unsigned long base; /* physical address pointer */ unsigned long size; - arch_spinlock_t lock; - struct rtas_args args; struct device_node *dev; /* virtual address pointer */ }; diff --git a/arch/powerpc/include/asm/rtas-work-area.h b/arch/powerpc/include/asm/rtas-work-area.h new file mode 100644 index 000000000000..251a395dbd2e --- /dev/null +++ b/arch/powerpc/include/asm/rtas-work-area.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_POWERPC_RTAS_WORK_AREA_H +#define _ASM_POWERPC_RTAS_WORK_AREA_H + +#include <linux/build_bug.h> +#include <linux/sizes.h> +#include <linux/types.h> + +#include <asm/page.h> + +/** + * struct rtas_work_area - RTAS work area descriptor. + * + * Descriptor for a "work area" in PAPR terminology that satisfies + * RTAS addressing requirements. + */ +struct rtas_work_area { + /* private: Use the APIs provided below. */ + char *buf; + size_t size; +}; + +enum { + /* Maximum allocation size, enforced at build time. */ + RTAS_WORK_AREA_MAX_ALLOC_SZ = SZ_128K, +}; + +/** + * rtas_work_area_alloc() - Acquire a work area of the requested size. + * @size_: Allocation size. Must be compile-time constant and not more + * than %RTAS_WORK_AREA_MAX_ALLOC_SZ. + * + * Allocate a buffer suitable for passing to RTAS functions that have + * a memory address parameter, often (but not always) referred to as a + * "work area" in PAPR. 
Although callers are allowed to block while + * holding a work area, the amount of memory reserved for this purpose + * is limited, and allocations should be short-lived. A good guideline + * is to release any allocated work area before returning from a + * system call. + * + * This function does not fail. It blocks until the allocation + * succeeds. To prevent deadlocks, callers are discouraged from + * allocating more than one work area simultaneously in a single task + * context. + * + * Context: This function may sleep. + * Return: A &struct rtas_work_area descriptor for the allocated work area. + */ +#define rtas_work_area_alloc(size_) ({ \ + static_assert(__builtin_constant_p(size_)); \ + static_assert((size_) > 0); \ + static_assert((size_) <= RTAS_WORK_AREA_MAX_ALLOC_SZ); \ + __rtas_work_area_alloc(size_); \ +}) + +/* + * Do not call __rtas_work_area_alloc() directly. Use + * rtas_work_area_alloc(). + */ +struct rtas_work_area *__rtas_work_area_alloc(size_t size); + +/** + * rtas_work_area_free() - Release a work area. + * @area: Work area descriptor as returned from rtas_work_area_alloc(). + * + * Return a work area buffer to the pool. + */ +void rtas_work_area_free(struct rtas_work_area *area); + +static inline char *rtas_work_area_raw_buf(const struct rtas_work_area *area) +{ + return area->buf; +} + +static inline size_t rtas_work_area_size(const struct rtas_work_area *area) +{ + return area->size; +} + +static inline phys_addr_t rtas_work_area_phys(const struct rtas_work_area *area) +{ + return __pa(area->buf); +} + +/* + * Early setup for the work area allocator. Call from + * rtas_initialize() only. + */ + +#ifdef CONFIG_PPC_PSERIES +void rtas_work_area_reserve_arena(phys_addr_t limit); +#else /* CONFIG_PPC_PSERIES */ +static inline void rtas_work_area_reserve_arena(phys_addr_t limit) {} +#endif /* CONFIG_PPC_PSERIES */ + +#endif /* _ASM_POWERPC_RTAS_WORK_AREA_H */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 479a95cb2770..3abe15ac79db 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -16,6 +16,185 @@ * Copyright (C) 2001 PPC 64 Team, IBM Corp */ +enum rtas_function_index { + RTAS_FNIDX__CHECK_EXCEPTION, + RTAS_FNIDX__DISPLAY_CHARACTER, + RTAS_FNIDX__EVENT_SCAN, + RTAS_FNIDX__FREEZE_TIME_BASE, + RTAS_FNIDX__GET_POWER_LEVEL, + RTAS_FNIDX__GET_SENSOR_STATE, + RTAS_FNIDX__GET_TERM_CHAR, + RTAS_FNIDX__GET_TIME_OF_DAY, + RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE, + RTAS_FNIDX__IBM_CBE_START_PTCAL, + RTAS_FNIDX__IBM_CBE_STOP_PTCAL, + RTAS_FNIDX__IBM_CHANGE_MSI, + RTAS_FNIDX__IBM_CLOSE_ERRINJCT, + RTAS_FNIDX__IBM_CONFIGURE_BRIDGE, + RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR, + RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP, + RTAS_FNIDX__IBM_CONFIGURE_PE, + RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW, + RTAS_FNIDX__IBM_DISPLAY_MESSAGE, + RTAS_FNIDX__IBM_ERRINJCT, + RTAS_FNIDX__IBM_EXTI2C, + RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO, + RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2, + RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE, + RTAS_FNIDX__IBM_GET_INDICES, + RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY, + RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER, + RTAS_FNIDX__IBM_GET_VPD, + RTAS_FNIDX__IBM_GET_XIVE, + RTAS_FNIDX__IBM_INT_OFF, + RTAS_FNIDX__IBM_INT_ON, + RTAS_FNIDX__IBM_IO_QUIESCE_ACK, + RTAS_FNIDX__IBM_LPAR_PERFTOOLS, + RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE, + RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION, + RTAS_FNIDX__IBM_NMI_INTERLOCK, + RTAS_FNIDX__IBM_NMI_REGISTER, + RTAS_FNIDX__IBM_OPEN_ERRINJCT, + RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE, + 
RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER, + RTAS_FNIDX__IBM_OS_TERM, + RTAS_FNIDX__IBM_PARTNER_CONTROL, + RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION, + RTAS_FNIDX__IBM_PLATFORM_DUMP, + RTAS_FNIDX__IBM_POWER_OFF_UPS, + RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER, + RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW, + RTAS_FNIDX__IBM_READ_PCI_CONFIG, + RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE, + RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2, + RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW, + RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS, + RTAS_FNIDX__IBM_SCAN_LOG_DUMP, + RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR, + RTAS_FNIDX__IBM_SET_EEH_OPTION, + RTAS_FNIDX__IBM_SET_SLOT_RESET, + RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER, + RTAS_FNIDX__IBM_SET_XIVE, + RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL, + RTAS_FNIDX__IBM_SUSPEND_ME, + RTAS_FNIDX__IBM_TUNE_DMA_PARMS, + RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT, + RTAS_FNIDX__IBM_UPDATE_NODES, + RTAS_FNIDX__IBM_UPDATE_PROPERTIES, + RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE, + RTAS_FNIDX__IBM_WRITE_PCI_CONFIG, + RTAS_FNIDX__NVRAM_FETCH, + RTAS_FNIDX__NVRAM_STORE, + RTAS_FNIDX__POWER_OFF, + RTAS_FNIDX__PUT_TERM_CHAR, + RTAS_FNIDX__QUERY_CPU_STOPPED_STATE, + RTAS_FNIDX__READ_PCI_CONFIG, + RTAS_FNIDX__RTAS_LAST_ERROR, + RTAS_FNIDX__SET_INDICATOR, + RTAS_FNIDX__SET_POWER_LEVEL, + RTAS_FNIDX__SET_TIME_FOR_POWER_ON, + RTAS_FNIDX__SET_TIME_OF_DAY, + RTAS_FNIDX__START_CPU, + RTAS_FNIDX__STOP_SELF, + RTAS_FNIDX__SYSTEM_REBOOT, + RTAS_FNIDX__THAW_TIME_BASE, + RTAS_FNIDX__WRITE_PCI_CONFIG, +}; + +/* + * Opaque handle for client code to refer to RTAS functions. All valid + * function handles are build-time constants prefixed with RTAS_FN_. + */ +typedef struct { + const enum rtas_function_index index; +} rtas_fn_handle_t; + + +#define rtas_fn_handle(x_) ((const rtas_fn_handle_t) { .index = x_, }) + +#define RTAS_FN_CHECK_EXCEPTION rtas_fn_handle(RTAS_FNIDX__CHECK_EXCEPTION) +#define RTAS_FN_DISPLAY_CHARACTER rtas_fn_handle(RTAS_FNIDX__DISPLAY_CHARACTER) +#define RTAS_FN_EVENT_SCAN rtas_fn_handle(RTAS_FNIDX__EVENT_SCAN) +#define RTAS_FN_FREEZE_TIME_BASE rtas_fn_handle(RTAS_FNIDX__FREEZE_TIME_BASE) +#define RTAS_FN_GET_POWER_LEVEL rtas_fn_handle(RTAS_FNIDX__GET_POWER_LEVEL) +#define RTAS_FN_GET_SENSOR_STATE rtas_fn_handle(RTAS_FNIDX__GET_SENSOR_STATE) +#define RTAS_FN_GET_TERM_CHAR rtas_fn_handle(RTAS_FNIDX__GET_TERM_CHAR) +#define RTAS_FN_GET_TIME_OF_DAY rtas_fn_handle(RTAS_FNIDX__GET_TIME_OF_DAY) +#define RTAS_FN_IBM_ACTIVATE_FIRMWARE rtas_fn_handle(RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE) +#define RTAS_FN_IBM_CBE_START_PTCAL rtas_fn_handle(RTAS_FNIDX__IBM_CBE_START_PTCAL) +#define RTAS_FN_IBM_CBE_STOP_PTCAL rtas_fn_handle(RTAS_FNIDX__IBM_CBE_STOP_PTCAL) +#define RTAS_FN_IBM_CHANGE_MSI rtas_fn_handle(RTAS_FNIDX__IBM_CHANGE_MSI) +#define RTAS_FN_IBM_CLOSE_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_CLOSE_ERRINJCT) +#define RTAS_FN_IBM_CONFIGURE_BRIDGE rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_BRIDGE) +#define RTAS_FN_IBM_CONFIGURE_CONNECTOR rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR) +#define RTAS_FN_IBM_CONFIGURE_KERNEL_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP) +#define RTAS_FN_IBM_CONFIGURE_PE rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_PE) +#define RTAS_FN_IBM_CREATE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW) +#define RTAS_FN_IBM_DISPLAY_MESSAGE rtas_fn_handle(RTAS_FNIDX__IBM_DISPLAY_MESSAGE) +#define RTAS_FN_IBM_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_ERRINJCT) +#define RTAS_FN_IBM_EXTI2C rtas_fn_handle(RTAS_FNIDX__IBM_EXTI2C) +#define RTAS_FN_IBM_GET_CONFIG_ADDR_INFO 
rtas_fn_handle(RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO) +#define RTAS_FN_IBM_GET_CONFIG_ADDR_INFO2 rtas_fn_handle(RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2) +#define RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE rtas_fn_handle(RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE) +#define RTAS_FN_IBM_GET_INDICES rtas_fn_handle(RTAS_FNIDX__IBM_GET_INDICES) +#define RTAS_FN_IBM_GET_RIO_TOPOLOGY rtas_fn_handle(RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY) +#define RTAS_FN_IBM_GET_SYSTEM_PARAMETER rtas_fn_handle(RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER) +#define RTAS_FN_IBM_GET_VPD rtas_fn_handle(RTAS_FNIDX__IBM_GET_VPD) +#define RTAS_FN_IBM_GET_XIVE rtas_fn_handle(RTAS_FNIDX__IBM_GET_XIVE) +#define RTAS_FN_IBM_INT_OFF rtas_fn_handle(RTAS_FNIDX__IBM_INT_OFF) +#define RTAS_FN_IBM_INT_ON rtas_fn_handle(RTAS_FNIDX__IBM_INT_ON) +#define RTAS_FN_IBM_IO_QUIESCE_ACK rtas_fn_handle(RTAS_FNIDX__IBM_IO_QUIESCE_ACK) +#define RTAS_FN_IBM_LPAR_PERFTOOLS rtas_fn_handle(RTAS_FNIDX__IBM_LPAR_PERFTOOLS) +#define RTAS_FN_IBM_MANAGE_FLASH_IMAGE rtas_fn_handle(RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE) +#define RTAS_FN_IBM_MANAGE_STORAGE_PRESERVATION rtas_fn_handle(RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION) +#define RTAS_FN_IBM_NMI_INTERLOCK rtas_fn_handle(RTAS_FNIDX__IBM_NMI_INTERLOCK) +#define RTAS_FN_IBM_NMI_REGISTER rtas_fn_handle(RTAS_FNIDX__IBM_NMI_REGISTER) +#define RTAS_FN_IBM_OPEN_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_ERRINJCT) +#define RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE) +#define RTAS_FN_IBM_OPEN_SRIOV_MAP_PE_NUMBER rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER) +#define RTAS_FN_IBM_OS_TERM rtas_fn_handle(RTAS_FNIDX__IBM_OS_TERM) +#define RTAS_FN_IBM_PARTNER_CONTROL rtas_fn_handle(RTAS_FNIDX__IBM_PARTNER_CONTROL) +#define RTAS_FN_IBM_PHYSICAL_ATTESTATION rtas_fn_handle(RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION) +#define RTAS_FN_IBM_PLATFORM_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_PLATFORM_DUMP) +#define RTAS_FN_IBM_POWER_OFF_UPS rtas_fn_handle(RTAS_FNIDX__IBM_POWER_OFF_UPS) +#define RTAS_FN_IBM_QUERY_INTERRUPT_SOURCE_NUMBER rtas_fn_handle(RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER) +#define RTAS_FN_IBM_QUERY_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW) +#define RTAS_FN_IBM_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_READ_PCI_CONFIG) +#define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE) +#define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2) +#define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW) +#define RTAS_FN_IBM_RESET_PE_DMA_WINDOWS rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS) +#define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP) +#define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR) +#define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION) +#define RTAS_FN_IBM_SET_SLOT_RESET rtas_fn_handle(RTAS_FNIDX__IBM_SET_SLOT_RESET) +#define RTAS_FN_IBM_SET_SYSTEM_PARAMETER rtas_fn_handle(RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER) +#define RTAS_FN_IBM_SET_XIVE rtas_fn_handle(RTAS_FNIDX__IBM_SET_XIVE) +#define RTAS_FN_IBM_SLOT_ERROR_DETAIL rtas_fn_handle(RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL) +#define RTAS_FN_IBM_SUSPEND_ME rtas_fn_handle(RTAS_FNIDX__IBM_SUSPEND_ME) +#define RTAS_FN_IBM_TUNE_DMA_PARMS rtas_fn_handle(RTAS_FNIDX__IBM_TUNE_DMA_PARMS) +#define RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT) 
+#define RTAS_FN_IBM_UPDATE_NODES rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_NODES) +#define RTAS_FN_IBM_UPDATE_PROPERTIES rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_PROPERTIES) +#define RTAS_FN_IBM_VALIDATE_FLASH_IMAGE rtas_fn_handle(RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE) +#define RTAS_FN_IBM_WRITE_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_WRITE_PCI_CONFIG) +#define RTAS_FN_NVRAM_FETCH rtas_fn_handle(RTAS_FNIDX__NVRAM_FETCH) +#define RTAS_FN_NVRAM_STORE rtas_fn_handle(RTAS_FNIDX__NVRAM_STORE) +#define RTAS_FN_POWER_OFF rtas_fn_handle(RTAS_FNIDX__POWER_OFF) +#define RTAS_FN_PUT_TERM_CHAR rtas_fn_handle(RTAS_FNIDX__PUT_TERM_CHAR) +#define RTAS_FN_QUERY_CPU_STOPPED_STATE rtas_fn_handle(RTAS_FNIDX__QUERY_CPU_STOPPED_STATE) +#define RTAS_FN_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__READ_PCI_CONFIG) +#define RTAS_FN_RTAS_LAST_ERROR rtas_fn_handle(RTAS_FNIDX__RTAS_LAST_ERROR) +#define RTAS_FN_SET_INDICATOR rtas_fn_handle(RTAS_FNIDX__SET_INDICATOR) +#define RTAS_FN_SET_POWER_LEVEL rtas_fn_handle(RTAS_FNIDX__SET_POWER_LEVEL) +#define RTAS_FN_SET_TIME_FOR_POWER_ON rtas_fn_handle(RTAS_FNIDX__SET_TIME_FOR_POWER_ON) +#define RTAS_FN_SET_TIME_OF_DAY rtas_fn_handle(RTAS_FNIDX__SET_TIME_OF_DAY) +#define RTAS_FN_START_CPU rtas_fn_handle(RTAS_FNIDX__START_CPU) +#define RTAS_FN_STOP_SELF rtas_fn_handle(RTAS_FNIDX__STOP_SELF) +#define RTAS_FN_SYSTEM_REBOOT rtas_fn_handle(RTAS_FNIDX__SYSTEM_REBOOT) +#define RTAS_FN_THAW_TIME_BASE rtas_fn_handle(RTAS_FNIDX__THAW_TIME_BASE) +#define RTAS_FN_WRITE_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__WRITE_PCI_CONFIG) + #define RTAS_UNKNOWN_SERVICE (-1) #define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */ @@ -222,6 +401,11 @@ extern void (*rtas_flash_term_hook)(int); extern struct rtas_t rtas; +s32 rtas_function_token(const rtas_fn_handle_t handle); +static inline bool rtas_function_implemented(const rtas_fn_handle_t handle) +{ + return rtas_function_token(handle) != RTAS_UNKNOWN_SERVICE; +} extern int rtas_token(const char *service); extern int rtas_service_present(const char *service); extern int rtas_call(int token, int, int, int *, ...); diff --git a/arch/powerpc/include/asm/secvar.h b/arch/powerpc/include/asm/secvar.h index 4cc35b58b986..4828e0ab7e3c 100644 --- a/arch/powerpc/include/asm/secvar.h +++ b/arch/powerpc/include/asm/secvar.h @@ -10,25 +10,30 @@ #include <linux/types.h> #include <linux/errno.h> +#include <linux/sysfs.h> extern const struct secvar_operations *secvar_ops; struct secvar_operations { - int (*get)(const char *key, uint64_t key_len, u8 *data, - uint64_t *data_size); - int (*get_next)(const char *key, uint64_t *key_len, - uint64_t keybufsize); - int (*set)(const char *key, uint64_t key_len, u8 *data, - uint64_t data_size); + int (*get)(const char *key, u64 key_len, u8 *data, u64 *data_size); + int (*get_next)(const char *key, u64 *key_len, u64 keybufsize); + int (*set)(const char *key, u64 key_len, u8 *data, u64 data_size); + ssize_t (*format)(char *buf, size_t bufsize); + int (*max_size)(u64 *max_size); + const struct attribute **config_attrs; + + // NULL-terminated array of fixed variable names + // Only used if get_next() isn't provided + const char * const *var_names; }; #ifdef CONFIG_PPC_SECURE_BOOT -extern void set_secvar_ops(const struct secvar_operations *ops); +int set_secvar_ops(const struct secvar_operations *ops); #else -static inline void set_secvar_ops(const struct secvar_operations *ops) { } +static inline int set_secvar_ops(const struct secvar_operations *ops) { return 0; } #endif diff --git 
a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index f63505d74932..6c6cb53d7045 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -26,6 +26,7 @@ #include <asm/percpu.h> extern int boot_cpuid; +extern int boot_cpu_hwid; /* PPC64 only */ extern int spinning_secondaries; extern u32 *cpu_to_phys_id; extern bool coregroup_enabled; diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index 08cd60cd70b7..82cc2c6704e6 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -119,6 +119,109 @@ TRACE_EVENT_FN_COND(hcall_exit, ); #endif +#ifdef CONFIG_PPC_RTAS + +#include <asm/rtas-types.h> + +TRACE_EVENT(rtas_input, + + TP_PROTO(struct rtas_args *rtas_args, const char *name), + + TP_ARGS(rtas_args, name), + + TP_STRUCT__entry( + __field(__u32, nargs) + __string(name, name) + __dynamic_array(__u32, inputs, be32_to_cpu(rtas_args->nargs)) + ), + + TP_fast_assign( + __entry->nargs = be32_to_cpu(rtas_args->nargs); + __assign_str(name, name); + be32_to_cpu_array(__get_dynamic_array(inputs), rtas_args->args, __entry->nargs); + ), + + TP_printk("%s arguments: %s", __get_str(name), + __print_array(__get_dynamic_array(inputs), __entry->nargs, 4) + ) +); + +TRACE_EVENT(rtas_output, + + TP_PROTO(struct rtas_args *rtas_args, const char *name), + + TP_ARGS(rtas_args, name), + + TP_STRUCT__entry( + __field(__u32, nr_other) + __field(__s32, status) + __string(name, name) + __dynamic_array(__u32, other_outputs, be32_to_cpu(rtas_args->nret) - 1) + ), + + TP_fast_assign( + __entry->nr_other = be32_to_cpu(rtas_args->nret) - 1; + __entry->status = be32_to_cpu(rtas_args->rets[0]); + __assign_str(name, name); + be32_to_cpu_array(__get_dynamic_array(other_outputs), + &rtas_args->rets[1], __entry->nr_other); + ), + + TP_printk("%s status: %d, other outputs: %s", __get_str(name), __entry->status, + __print_array(__get_dynamic_array(other_outputs), + __entry->nr_other, 4) + ) +); + +DECLARE_EVENT_CLASS(rtas_parameter_block, + + TP_PROTO(struct rtas_args *rtas_args), + + TP_ARGS(rtas_args), + + TP_STRUCT__entry( + __field(u32, token) + __field(u32, nargs) + __field(u32, nret) + __array(__u32, params, 16) + ), + + TP_fast_assign( + __entry->token = be32_to_cpu(rtas_args->token); + __entry->nargs = be32_to_cpu(rtas_args->nargs); + __entry->nret = be32_to_cpu(rtas_args->nret); + be32_to_cpu_array(__entry->params, rtas_args->args, ARRAY_SIZE(rtas_args->args)); + ), + + TP_printk("token=%u nargs=%u nret=%u params:" + " [0]=0x%08x [1]=0x%08x [2]=0x%08x [3]=0x%08x" + " [4]=0x%08x [5]=0x%08x [6]=0x%08x [7]=0x%08x" + " [8]=0x%08x [9]=0x%08x [10]=0x%08x [11]=0x%08x" + " [12]=0x%08x [13]=0x%08x [14]=0x%08x [15]=0x%08x", + __entry->token, __entry->nargs, __entry->nret, + __entry->params[0], __entry->params[1], __entry->params[2], __entry->params[3], + __entry->params[4], __entry->params[5], __entry->params[6], __entry->params[7], + __entry->params[8], __entry->params[9], __entry->params[10], __entry->params[11], + __entry->params[12], __entry->params[13], __entry->params[14], __entry->params[15] + ) +); + +DEFINE_EVENT(rtas_parameter_block, rtas_ll_entry, + + TP_PROTO(struct rtas_args *rtas_args), + + TP_ARGS(rtas_args) +); + +DEFINE_EVENT(rtas_parameter_block, rtas_ll_exit, + + TP_PROTO(struct rtas_args *rtas_args), + + TP_ARGS(rtas_args) +); + +#endif /* CONFIG_PPC_RTAS */ + #ifdef CONFIG_PPC_POWERNV extern int opal_tracepoint_regfunc(void); extern void opal_tracepoint_unregfunc(void); diff --git a/arch/powerpc/kernel/Makefile 
b/arch/powerpc/kernel/Makefile index 9b6146056e48..9bf2be123093 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -54,6 +54,13 @@ CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING endif +KCSAN_SANITIZE_early_32.o := n +KCSAN_SANITIZE_early_64.o := n +KCSAN_SANITIZE_cputable.o := n +KCSAN_SANITIZE_btext.o := n +KCSAN_SANITIZE_paca.o := n +KCSAN_SANITIZE_setup_64.o := n + #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET # Remove stack protector to avoid triggering unneeded stack canary # checks due to randomize_kstack_offset. @@ -177,12 +184,15 @@ obj-$(CONFIG_PPC_SECVAR_SYSFS) += secvar-sysfs.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n KCOV_INSTRUMENT_prom_init.o := n +KCSAN_SANITIZE_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_kprobes.o := n KCOV_INSTRUMENT_kprobes.o := n +KCSAN_SANITIZE_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n GCOV_PROFILE_kprobes-ftrace.o := n KCOV_INSTRUMENT_kprobes-ftrace.o := n +KCSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n GCOV_PROFILE_syscall_64.o := n KCOV_INSTRUMENT_syscall_64.o := n diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index f279295179bd..438568a472d0 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1065,10 +1065,10 @@ recover_failed: eeh_slot_error_detail(pe, EEH_LOG_PERM); /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_set_irq_state(pe, false); eeh_pe_report("error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); /* Mark the PE to be removed permanently */ eeh_pe_state_mark(pe, EEH_PE_REMOVED); @@ -1185,10 +1185,10 @@ void eeh_handle_special_event(void) /* Notify all devices to be down */ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 69a912550577..033116e465d0 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -21,7 +21,13 @@ _GLOBAL(epapr_ev_idle) ori r4, r4,_TLF_NAPPING /* so when we take an exception */ PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */ +#ifdef CONFIG_BOOKE_OR_40x wrteei 1 +#else + mfmsr r4 + ori r4, r4, MSR_EE + mtmsr r4 +#endif idle_loop: LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 7558ba4eb864..1febb56ebaeb 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -160,12 +160,8 @@ __secondary_hold: std r24,(ABS_ADDR(__secondary_hold_acknowledge, first_256B))(0) sync - li r26,0 -#ifdef CONFIG_PPC_BOOK3E_64 - tovirt(r26,r26) -#endif /* All secondary cpus wait here until told to start. */ -100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(r26) +100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(0) cmpdi 0,r12,0 beq 100b @@ -475,8 +471,31 @@ SYM_FUNC_START_LOCAL(__mmu_off) rfid b . 
/* prevent speculative execution */ SYM_FUNC_END(__mmu_off) -#endif +SYM_FUNC_START_LOCAL(start_initialization_book3s) + mflr r25 + + /* Setup some critical 970 SPRs before switching MMU off */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 /* 970 */ + beq 1f + cmpwi r0,0x3c /* 970FX */ + beq 1f + cmpwi r0,0x44 /* 970MP */ + beq 1f + cmpwi r0,0x45 /* 970GX */ + bne 2f +1: bl __cpu_preinit_ppc970 +2: + + /* Switch off MMU if not already off */ + bl __mmu_off + + mtlr r25 + blr +SYM_FUNC_END(start_initialization_book3s) +#endif /* * Here is our main kernel entry point. We support currently 2 kind of entries @@ -523,26 +542,10 @@ __start_initialization_multiplatform: #ifdef CONFIG_PPC_BOOK3E_64 bl start_initialization_book3e - b __after_prom_start #else - /* Setup some critical 970 SPRs before switching MMU off */ - mfspr r0,SPRN_PVR - srwi r0,r0,16 - cmpwi r0,0x39 /* 970 */ - beq 1f - cmpwi r0,0x3c /* 970FX */ - beq 1f - cmpwi r0,0x44 /* 970MP */ - beq 1f - cmpwi r0,0x45 /* 970GX */ - bne 2f -1: bl __cpu_preinit_ppc970 -2: - - /* Switch off MMU if not already off */ - bl __mmu_off - b __after_prom_start + bl start_initialization_book3s #endif /* CONFIG_PPC_BOOK3E_64 */ + b __after_prom_start __REF __boot_from_prom: diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index caebe1431596..ee95937bdaf1 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -67,11 +67,9 @@ static void iommu_debugfs_add(struct iommu_table *tbl) static void iommu_debugfs_del(struct iommu_table *tbl) { char name[10]; - struct dentry *liobn_entry; sprintf(name, "%08lx", tbl->it_index); - liobn_entry = debugfs_lookup(name, iommu_debugfs_dir); - debugfs_remove(liobn_entry); + debugfs_lookup_and_remove(name, iommu_debugfs_dir); } #else static void iommu_debugfs_add(struct iommu_table *tbl){} diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c index eb2b380e52a0..c788c55512ed 100644 --- a/arch/powerpc/kernel/irq_64.c +++ b/arch/powerpc/kernel/irq_64.c @@ -70,22 +70,19 @@ int distribute_irqs = 1; static inline void next_interrupt(struct pt_regs *regs) { - /* - * Softirq processing can enable/disable irqs, which will leave - * MSR[EE] enabled and the soft mask set to IRQS_DISABLED. Fix - * this up. - */ - if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) - hard_irq_disable(); - else - irq_soft_mask_set(IRQS_ALL_DISABLED); + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)); + WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + } /* * We are responding to the next interrupt, so interrupt-off * latencies should be reset here. */ + lockdep_hardirq_exit(); trace_hardirqs_on(); trace_hardirqs_off(); + lockdep_hardirq_enter(); } static inline bool irq_happened_test_and_clear(u8 irq) @@ -97,22 +94,11 @@ static inline bool irq_happened_test_and_clear(u8 irq) return false; } -void replay_soft_interrupts(void) +static __no_kcsan void __replay_soft_interrupts(void) { struct pt_regs regs; /* - * Be careful here, calling these interrupt handlers can cause - * softirqs to be raised, which they may run when calling irq_exit, - * which will cause local_irq_enable() to be run, which can then - * recurse into this function. Don't keep any state across - * interrupt handler calls which may change underneath us. - * - * Softirqs can not be disabled over replay to stop this recursion - * because interrupts taken in idle code may require RCU softirq - * to run in the irq RCU tracking context. 
This is a hard problem - to fix without changes to the softirq or idle layer. - * * We use local_paca rather than get_paca() to avoid all the * debug_smp_processor_id() business in this low level function. */ @@ -120,13 +106,20 @@ void replay_soft_interrupts(void) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { WARN_ON_ONCE(mfmsr() & MSR_EE); WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)); + WARN_ON(local_paca->irq_happened & PACA_IRQ_REPLAYING); } + /* + * PACA_IRQ_REPLAYING prevents interrupt handlers from enabling + * MSR[EE] to get PMIs, which can result in more IRQs becoming + * pending. + */ + local_paca->irq_happened |= PACA_IRQ_REPLAYING; + ppc_save_regs(&regs); regs.softe = IRQS_ENABLED; regs.msr |= MSR_EE; -again: /* * Force the delivery of pending soft-disabled interrupts on PS3. * Any HV call will have this side effect. @@ -175,17 +168,18 @@ again: next_interrupt(&regs); } - /* * Softirq processing can enable and disable interrupts, which can * result in new irqs becoming pending. Must keep looping until we * have cleared out all pending interrupts. */ - if (local_paca->irq_happened & ~PACA_IRQ_HARD_DIS) - goto again; + local_paca->irq_happened &= ~PACA_IRQ_REPLAYING; +} + +__no_kcsan void replay_soft_interrupts(void) +{ + irq_enter(); /* See comment in arch_local_irq_restore */ + __replay_soft_interrupts(); + irq_exit(); } #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP) -static inline void replay_soft_interrupts_irqrestore(void) +static inline __no_kcsan void replay_soft_interrupts_irqrestore(void) { unsigned long kuap_state = get_kuap(); @@ -200,16 +194,16 @@ static inline void replay_soft_interrupts_irqrestore(void) if (kuap_state != AMR_KUAP_BLOCKED) set_kuap(AMR_KUAP_BLOCKED); - replay_soft_interrupts(); + __replay_soft_interrupts(); if (kuap_state != AMR_KUAP_BLOCKED) set_kuap(kuap_state); } #else -#define replay_soft_interrupts_irqrestore() replay_soft_interrupts() +#define replay_soft_interrupts_irqrestore() __replay_soft_interrupts() #endif -notrace void arch_local_irq_restore(unsigned long mask) +notrace __no_kcsan void arch_local_irq_restore(unsigned long mask) { unsigned char irq_happened; @@ -219,9 +213,13 @@ notrace void arch_local_irq_restore(unsigned long mask) return; } - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - WARN_ON_ONCE(in_nmi() || in_hardirq()); + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON_ONCE(in_nmi()); + WARN_ON_ONCE(in_hardirq()); + WARN_ON_ONCE(local_paca->irq_happened & PACA_IRQ_REPLAYING); + } +again: /* * After the stb, interrupts are unmasked and there are no interrupts * pending replay. The restart sequence makes this atomic with @@ -248,6 +246,12 @@ notrace void arch_local_irq_restore(unsigned long mask) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) WARN_ON_ONCE(!(mfmsr() & MSR_EE)); + /* + * If we came here from the replay below, we might have a preempt + * pending (due to preempt_enable_no_resched()). Have to check now. + */ + preempt_check_resched(); + return; happened: @@ -261,6 +265,7 @@ happened: irq_soft_mask_set(IRQS_ENABLED); local_paca->irq_happened = 0; __hard_irq_enable(); + preempt_check_resched(); return; } @@ -296,12 +301,38 @@ happened: irq_soft_mask_set(IRQS_ALL_DISABLED); trace_hardirqs_off(); + /* + * Now enter interrupt context. The interrupt handlers themselves + * also call irq_enter/exit (which is okay, they can nest). But call + * it here now to hold off softirqs until the below irq_exit.
If + * we allowed replayed handlers to run softirqs, that enables irqs, + * which must replay interrupts, which recurses in here and makes + * things more complicated. The recursion is limited to 2, and it can + * be made to work, but it's complicated. + * + * local_bh_disable can not be used here because interrupts taken in + * idle are not in the right context (RCU, tick, etc) to run softirqs + * so irq_enter must be called. + */ + irq_enter(); + replay_soft_interrupts_irqrestore(); + irq_exit(); + + if (unlikely(local_paca->irq_happened != PACA_IRQ_HARD_DIS)) { + /* + * The softirq processing in irq_exit() may enable interrupts + * temporarily, which can result in MSR[EE] being enabled and + * more irqs becoming pending. Go around again if that happens. + */ + trace_hardirqs_on(); + preempt_enable_no_resched(); + goto again; + } + trace_hardirqs_on(); irq_soft_mask_set(IRQS_ENABLED); - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) - WARN_ON(local_paca->irq_happened != PACA_IRQ_HARD_DIS); local_paca->irq_happened = 0; __hard_irq_enable(); preempt_enable(); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..219f28637a3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* + * Raise irq work, So that we don't miss to log the error for + * unrecoverable errors. + */ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -233,9 +240,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) } memcpy(&local_paca->mce_info->mce_ue_event_queue[index], evt, sizeof(*evt)); - - /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index ff045644f13f..2ac78d207f77 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -502,9 +502,10 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, static int restore_r2(const char *name, u32 *instruction, struct module *me) { u32 *prev_insn = instruction - 1; + u32 insn_val = *instruction; if (is_mprofile_ftrace_call(name)) - return 1; + return 0; /* * Make sure the branch isn't a sibling call. Sibling calls aren't @@ -512,19 +513,25 @@ static int restore_r2(const char *name, u32 *instruction, struct module *me) * restore afterwards. */ if (!instr_is_relative_link_branch(ppc_inst(*prev_insn))) - return 1; + return 0; - if (*instruction != PPC_RAW_NOP()) { - pr_err("%s: Expected nop after call, got %08x at %pS\n", - me->name, *instruction, instruction); + /* + * For livepatch, the restore r2 instruction might have already been + * written previously, if the referenced symbol is in a previously + * unloaded module which is now being loaded again. In that case, skip + * the warning and the instruction write. 
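The restore_r2() rework above changes its contract from "1 on success" to "0 on success, -ENOEXEC on a malformed call site", and makes the patching idempotent for the livepatch reload case just described. A minimal model of the new decision logic (sketch only, assuming kernel headers for the opcode macros; a plain store stands in for patch_instruction(), which can itself fail):

	static int restore_r2_model(u32 *insn)
	{
		if (*insn == PPC_INST_LD_TOC)	/* already patched: accept */
			return 0;
		if (*insn != PPC_RAW_NOP())	/* neither nop nor TOC load */
			return -ENOEXEC;
		*insn = PPC_INST_LD_TOC;	/* ld r2,R2_STACK_OFFSET(r1) */
		return 0;
	}

The apply_relocate_add() hunk that follows flips its test to match: nonzero now means failure.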
+ */ + if (insn_val == PPC_INST_LD_TOC) return 0; + + if (insn_val != PPC_RAW_NOP()) { + pr_err("%s: Expected nop after call, got %08x at %pS\n", + me->name, insn_val, instruction); + return -ENOEXEC; } /* ld r2,R2_STACK_OFFSET(r1) */ - if (patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC))) - return 0; - - return 1; + return patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC)); } int apply_relocate_add(Elf64_Shdr *sechdrs, @@ -648,8 +655,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, strtab + sym->st_name); if (!value) return -ENOENT; - if (!restore_r2(strtab + sym->st_name, - (u32 *)location + 1, me)) + if (restore_r2(strtab + sym->st_name, + (u32 *)location + 1, me)) return -ENOEXEC; } else value += local_entry_offset(sym); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 855b59892c5c..ce0c8623e563 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -62,7 +62,7 @@ fixup_cpc710_pci64(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CPC710_PCI64, fixup_cpc710_pci64); -#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP) +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP static u8* pci_to_OF_bus_map; static int pci_bus_count; @@ -152,6 +152,7 @@ pcibios_make_OF_bus_map(void) } #endif } +#endif // CONFIG_PPC_PCI_OF_BUS_MAP #ifdef CONFIG_PPC_PMAC @@ -160,7 +161,9 @@ pcibios_make_OF_bus_map(void) */ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) { +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP struct pci_dev *dev = NULL; +#endif const __be32 *reg; int size; @@ -175,6 +178,9 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) *bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff; *devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff; +#ifndef CONFIG_PPC_PCI_OF_BUS_MAP + return 0; +#else /* Ok, here we need some tweak. If we have already renumbered * all busses, we can't rely on the OF bus number any more.
* the pci_to_OF_bus_map is not enough as several PCI busses @@ -192,11 +198,12 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) } return -ENODEV; +#endif // CONFIG_PPC_PCI_OF_BUS_MAP } EXPORT_SYMBOL(pci_device_from_OF_node); #endif -#ifdef CONFIG_PPC_CHRP +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP /* We create the "pci-OF-bus-map" property now so it appears in the * /proc device tree */ @@ -221,9 +228,7 @@ pci_create_OF_bus_map(void) of_node_put(dn); } } -#endif - -#endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP) */ +#endif // CONFIG_PPC_PCI_OF_BUS_MAP void pcibios_setup_phb_io_space(struct pci_controller *hose) { @@ -273,6 +278,7 @@ static int __init pcibios_init(void) } #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP) +#ifdef CONFIG_PPC_PCI_OF_BUS_MAP pci_bus_count = next_busno; /* OpenFirmware based machines need a map of OF bus @@ -282,6 +288,7 @@ static int __init pcibios_init(void) if (pci_assign_all_buses) pcibios_make_OF_bus_map(); #endif +#endif /* Call common code to handle resource allocation */ pcibios_resource_survey(); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index c22cc234672f..4b29ac5ddac6 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1405,8 +1405,7 @@ static void show_instructions(struct pt_regs *regs) for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; - if (!__kernel_text_address(pc) || - get_kernel_nofault(instr, (const void *)pc)) { + if (get_kernel_nofault(instr, (const void *)pc)) { pr_cont("XXXXXXXX "); } else { if (nip == pc) @@ -2118,6 +2117,9 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, unsigned long stack_page; unsigned long cpu = task_cpu(p); + if (!hardirq_ctx[cpu] || !softirq_ctx[cpu]) + return 0; + stack_page = (unsigned long)hardirq_ctx[cpu]; if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; @@ -2139,6 +2141,14 @@ static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p, if (!paca_ptrs) return 0; + if (!paca_ptrs[cpu]->emergency_sp) + return 0; + +# ifdef CONFIG_PPC_BOOK3S_64 + if (!paca_ptrs[cpu]->nmi_emergency_sp || !paca_ptrs[cpu]->mc_emergency_sp) + return 0; +#endif + stack_page = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE; if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4f1c920aa13e..9d9ee4e9e1a1 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -56,6 +56,7 @@ #include <asm/drmem.h> #include <asm/ultravisor.h> #include <asm/prom.h> +#include <asm/plpks.h> #include <mm/mmu_decl.h> @@ -370,8 +371,8 @@ static int __init early_init_dt_scan_cpus(unsigned long node, be32_to_cpu(intserv[found_thread])); boot_cpuid = found; - // Pass the boot CPU's hard CPU id back to our caller - *((u32 *)data) = be32_to_cpu(intserv[found_thread]); + if (IS_ENABLED(CONFIG_PPC64)) + boot_cpu_hwid = be32_to_cpu(intserv[found_thread]); /* * PAPR defines "logical" PVR values for cpus that @@ -755,7 +756,6 @@ static inline void save_fscr_to_task(void) {} void __init early_init_devtree(void *params) { - u32 boot_cpu_hwid; phys_addr_t limit; DBG(" -> early_init_devtree(%px)\n", params); @@ -851,7 +851,7 @@ void __init early_init_devtree(void *params) /* Retrieve CPU related informations from the flat tree * (altivec support, boot CPU ID, ...) 
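For reference, the early_init_dt_scan_cpus() change in prom.c below drops the void *data out-parameter in favour of the boot_cpu_hwid global. Callbacks passed to of_scan_flat_dt() keep this shape (hypothetical minimal example; the callback name is illustrative, the signature is the standard of_fdt one):

	static int __init sketch_scan_cpus(unsigned long node, const char *uname,
					   int depth, void *data)
	{
		/* data may now be NULL; results land in globals like boot_cpu_hwid */
		return 0;	/* nonzero would stop the scan early */
	}

	/* of_scan_flat_dt(sketch_scan_cpus, NULL); */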
*/ - of_scan_flat_dt(early_init_dt_scan_cpus, &boot_cpu_hwid); + of_scan_flat_dt(early_init_dt_scan_cpus, NULL); if (boot_cpuid < 0) { printk("Failed to identify boot CPU !\n"); BUG(); @@ -868,11 +868,6 @@ void __init early_init_devtree(void *params) mmu_early_init_devtree(); - // NB. paca is not installed until later in early_setup() - allocate_paca_ptrs(); - allocate_paca(boot_cpuid); - set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid); - #ifdef CONFIG_PPC_POWERNV /* Scan and build the list of machine check recoverable ranges */ of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL); @@ -893,6 +888,9 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + /* If kexec left a PLPKS password in the DT, get it and clear it */ + plpks_early_init_devtree(); + tm_init(); DBG(" <- early_init_devtree()\n"); diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 311890d71c4c..5a319863f289 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -51,11 +51,10 @@ do # a leading . on the name, so strip it off here. UNDEF="${UNDEF#.}" - if [ $KBUILD_VERBOSE ]; then - if [ $KBUILD_VERBOSE -ne 0 ]; then - echo "Checking prom_init.o symbol '$UNDEF'" - fi - fi + case "$KBUILD_VERBOSE" in + *1*) + echo "Checking prom_init.o symbol '$UNDEF'" ;; + esac OK=0 for WHITE in $WHITELIST diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 081b2b741a8c..9454b8395b6a 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -287,9 +287,9 @@ static ssize_t ppc_rtas_poweron_write(struct file *file, rtc_time64_to_tm(nowtime, &tm); - error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */); + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_FOR_POWER_ON), 7, 1, NULL, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */); if (error) printk(KERN_WARNING "error: setting poweron time returned: %s\n", ppc_rtas_process_error(error)); @@ -350,9 +350,9 @@ static ssize_t ppc_rtas_clock_write(struct file *file, return error; rtc_time64_to_tm(nowtime, &tm); - error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, 0); + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0); if (error) printk(KERN_WARNING "error: setting the clock returned: %s\n", ppc_rtas_process_error(error)); @@ -362,7 +362,7 @@ static ssize_t ppc_rtas_clock_write(struct file *file, static int ppc_rtas_clock_show(struct seq_file *m, void *v) { int ret[8]; - int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + int error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); if (error) { printk(KERN_WARNING "error: reading the clock returned: %s\n", @@ -385,7 +385,7 @@ static int ppc_rtas_sensors_show(struct seq_file *m, void *v) { int i,j; int state, error; - int get_sensor_state = rtas_token("get-sensor-state"); + int get_sensor_state = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n"); seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n"); @@ -708,8 +708,8 @@ static ssize_t 
ppc_rtas_tone_freq_write(struct file *file, return error; rtas_tone_frequency = freq; /* save it for later */ - error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL, - TONE_FREQUENCY, 0, freq); + error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL, + TONE_FREQUENCY, 0, freq); if (error) printk(KERN_WARNING "error: setting tone frequency returned: %s\n", ppc_rtas_process_error(error)); @@ -736,8 +736,8 @@ static ssize_t ppc_rtas_tone_volume_write(struct file *file, volume = 100; rtas_tone_volume = volume; /* save it for later */ - error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL, - TONE_VOLUME, 0, volume); + error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL, + TONE_VOLUME, 0, volume); if (error) printk(KERN_WARNING "error: setting tone volume returned: %s\n", ppc_rtas_process_error(error)); diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c index 5a31d1829bca..6996214532bd 100644 --- a/arch/powerpc/kernel/rtas-rtc.c +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -21,7 +21,7 @@ time64_t __init rtas_get_boot_time(void) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); wait_time = rtas_busy_delay_time(error); if (wait_time) { @@ -53,7 +53,7 @@ void rtas_get_rtc_time(struct rtc_time *rtc_tm) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret); wait_time = rtas_busy_delay_time(error); if (wait_time) { @@ -90,7 +90,7 @@ int rtas_set_rtc_time(struct rtc_time *tm) max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; do { - error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL, tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec, 0); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index deded51a7978..31175b34856a 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -9,10 +9,12 @@ #define pr_fmt(fmt) "rtas: " fmt +#include <linux/bsearch.h> #include <linux/capability.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/init.h> +#include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/memblock.h> #include <linux/of.h> @@ -26,6 +28,7 @@ #include <linux/syscalls.h> #include <linux/types.h> #include <linux/uaccess.h> +#include <linux/xarray.h> #include <asm/delay.h> #include <asm/firmware.h> @@ -33,43 +36,604 @@ #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/page.h> +#include <asm/rtas-work-area.h> #include <asm/rtas.h> #include <asm/time.h> +#include <asm/trace.h> #include <asm/udbg.h> +struct rtas_filter { + /* Indexes into the args buffer, -1 if not used */ + const int buf_idx1; + const int size_idx1; + const int buf_idx2; + const int size_idx2; + /* + * Assumed buffer size per the spec if the function does not + * have a size parameter, e.g. ibm,errinjct. 0 if unused. + */ + const int fixed_size; +}; + +/** + * struct rtas_function - Descriptor for RTAS functions. + * + * @token: Value of @name if it exists under the /rtas node. + * @name: Function name. 
+ * @filter: If non-NULL, invoking this function via the rtas syscall is + * generally allowed, and @filter describes constraints on the + * arguments. See also @banned_for_syscall_on_le. + * @banned_for_syscall_on_le: Set when call via sys_rtas is generally allowed + * but specifically restricted on ppc64le. Such + * functions are believed to have no users on + * ppc64le, and we want to keep it that way. It does + * not make sense for this to be set when @filter + * is false. + */ +struct rtas_function { + s32 token; + const bool banned_for_syscall_on_le:1; + const char * const name; + const struct rtas_filter *filter; +}; + +static struct rtas_function rtas_function_table[] __ro_after_init = { + [RTAS_FNIDX__CHECK_EXCEPTION] = { + .name = "check-exception", + }, + [RTAS_FNIDX__DISPLAY_CHARACTER] = { + .name = "display-character", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__EVENT_SCAN] = { + .name = "event-scan", + }, + [RTAS_FNIDX__FREEZE_TIME_BASE] = { + .name = "freeze-time-base", + }, + [RTAS_FNIDX__GET_POWER_LEVEL] = { + .name = "get-power-level", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__GET_SENSOR_STATE] = { + .name = "get-sensor-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__GET_TERM_CHAR] = { + .name = "get-term-char", + }, + [RTAS_FNIDX__GET_TIME_OF_DAY] = { + .name = "get-time-of-day", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE] = { + .name = "ibm,activate-firmware", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_CBE_START_PTCAL] = { + .name = "ibm,cbe-start-ptcal", + }, + [RTAS_FNIDX__IBM_CBE_STOP_PTCAL] = { + .name = "ibm,cbe-stop-ptcal", + }, + [RTAS_FNIDX__IBM_CHANGE_MSI] = { + .name = "ibm,change-msi", + }, + [RTAS_FNIDX__IBM_CLOSE_ERRINJCT] = { + .name = "ibm,close-errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_CONFIGURE_BRIDGE] = { + .name = "ibm,configure-bridge", + }, + [RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR] = { + .name = "ibm,configure-connector", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = 1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP] = { + .name = "ibm,configure-kernel-dump", + }, + [RTAS_FNIDX__IBM_CONFIGURE_PE] = { + .name = "ibm,configure-pe", + }, + [RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW] = { + .name = "ibm,create-pe-dma-window", + }, + [RTAS_FNIDX__IBM_DISPLAY_MESSAGE] = { + .name = "ibm,display-message", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_ERRINJCT] = { + .name = "ibm,errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 1024, + }, + }, + [RTAS_FNIDX__IBM_EXTI2C] = { + .name = "ibm,exti2c", + }, + [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO] = { + .name = "ibm,get-config-addr-info", + }, + [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2] = { + .name = "ibm,get-config-addr-info2", + .filter = &(const struct 
rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE] = { + .name = "ibm,get-dynamic-sensor-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_INDICES] = { + .name = "ibm,get-indices", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = 3, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY] = { + .name = "ibm,get-rio-topology", + }, + [RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER] = { + .name = "ibm,get-system-parameter", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = 2, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_GET_VPD] = { + .name = "ibm,get-vpd", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = 1, .size_idx2 = 2, + }, + }, + [RTAS_FNIDX__IBM_GET_XIVE] = { + .name = "ibm,get-xive", + }, + [RTAS_FNIDX__IBM_INT_OFF] = { + .name = "ibm,int-off", + }, + [RTAS_FNIDX__IBM_INT_ON] = { + .name = "ibm,int-on", + }, + [RTAS_FNIDX__IBM_IO_QUIESCE_ACK] = { + .name = "ibm,io-quiesce-ack", + }, + [RTAS_FNIDX__IBM_LPAR_PERFTOOLS] = { + .name = "ibm,lpar-perftools", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = 3, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE] = { + .name = "ibm,manage-flash-image", + }, + [RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION] = { + .name = "ibm,manage-storage-preservation", + }, + [RTAS_FNIDX__IBM_NMI_INTERLOCK] = { + .name = "ibm,nmi-interlock", + }, + [RTAS_FNIDX__IBM_NMI_REGISTER] = { + .name = "ibm,nmi-register", + }, + [RTAS_FNIDX__IBM_OPEN_ERRINJCT] = { + .name = "ibm,open-errinjct", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE] = { + .name = "ibm,open-sriov-allow-unfreeze", + }, + [RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER] = { + .name = "ibm,open-sriov-map-pe-number", + }, + [RTAS_FNIDX__IBM_OS_TERM] = { + .name = "ibm,os-term", + }, + [RTAS_FNIDX__IBM_PARTNER_CONTROL] = { + .name = "ibm,partner-control", + }, + [RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION] = { + .name = "ibm,physical-attestation", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_PLATFORM_DUMP] = { + .name = "ibm,platform-dump", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 4, .size_idx1 = 5, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_POWER_OFF_UPS] = { + .name = "ibm,power-off-ups", + }, + [RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER] = { + .name = "ibm,query-interrupt-source-number", + }, + [RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW] = { + .name = "ibm,query-pe-dma-window", + }, + [RTAS_FNIDX__IBM_READ_PCI_CONFIG] = { + .name = "ibm,read-pci-config", + }, + [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE] = { + .name = "ibm,read-slot-reset-state", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = { + .name = "ibm,read-slot-reset-state2", + }, + [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { + .name = "ibm,remove-pe-dma-window", + }, + [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS] = { + .name = "ibm,reset-pe-dma-windows", + }, + [RTAS_FNIDX__IBM_SCAN_LOG_DUMP] = { + .name = "ibm,scan-log-dump", + .filter = 
&(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = { + .name = "ibm,set-dynamic-indicator", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 2, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_EEH_OPTION] = { + .name = "ibm,set-eeh-option", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_SLOT_RESET] = { + .name = "ibm,set-slot-reset", + }, + [RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER] = { + .name = "ibm,set-system-parameter", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_SET_XIVE] = { + .name = "ibm,set-xive", + }, + [RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL] = { + .name = "ibm,slot-error-detail", + }, + [RTAS_FNIDX__IBM_SUSPEND_ME] = { + .name = "ibm,suspend-me", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__IBM_TUNE_DMA_PARMS] = { + .name = "ibm,tune-dma-parms", + }, + [RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT] = { + .name = "ibm,update-flash-64-and-reboot", + }, + [RTAS_FNIDX__IBM_UPDATE_NODES] = { + .name = "ibm,update-nodes", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_UPDATE_PROPERTIES] = { + .name = "ibm,update-properties", + .banned_for_syscall_on_le = true, + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + .fixed_size = 4096, + }, + }, + [RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE] = { + .name = "ibm,validate-flash-image", + }, + [RTAS_FNIDX__IBM_WRITE_PCI_CONFIG] = { + .name = "ibm,write-pci-config", + }, + [RTAS_FNIDX__NVRAM_FETCH] = { + .name = "nvram-fetch", + }, + [RTAS_FNIDX__NVRAM_STORE] = { + .name = "nvram-store", + }, + [RTAS_FNIDX__POWER_OFF] = { + .name = "power-off", + }, + [RTAS_FNIDX__PUT_TERM_CHAR] = { + .name = "put-term-char", + }, + [RTAS_FNIDX__QUERY_CPU_STOPPED_STATE] = { + .name = "query-cpu-stopped-state", + }, + [RTAS_FNIDX__READ_PCI_CONFIG] = { + .name = "read-pci-config", + }, + [RTAS_FNIDX__RTAS_LAST_ERROR] = { + .name = "rtas-last-error", + }, + [RTAS_FNIDX__SET_INDICATOR] = { + .name = "set-indicator", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_POWER_LEVEL] = { + .name = "set-power-level", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_TIME_FOR_POWER_ON] = { + .name = "set-time-for-power-on", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__SET_TIME_OF_DAY] = { + .name = "set-time-of-day", + .filter = &(const struct rtas_filter) { + .buf_idx1 = -1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + }, + [RTAS_FNIDX__START_CPU] = { + .name = "start-cpu", + }, + [RTAS_FNIDX__STOP_SELF] = { + .name = "stop-self", + }, + [RTAS_FNIDX__SYSTEM_REBOOT] = { + .name = "system-reboot", + }, + [RTAS_FNIDX__THAW_TIME_BASE] = { + .name = "thaw-time-base", + }, + [RTAS_FNIDX__WRITE_PCI_CONFIG] = { + .name = "write-pci-config", + }, 
+}; + +/** + * rtas_function_token() - RTAS function token lookup. + * @handle: Function handle, e.g. RTAS_FN_EVENT_SCAN. + * + * Context: Any context. + * Return: the token value for the function if implemented by this platform, + * otherwise RTAS_UNKNOWN_SERVICE. + */ +s32 rtas_function_token(const rtas_fn_handle_t handle) +{ + const size_t index = handle.index; + const bool out_of_bounds = index >= ARRAY_SIZE(rtas_function_table); + + if (WARN_ONCE(out_of_bounds, "invalid function index %zu", index)) + return RTAS_UNKNOWN_SERVICE; + /* + * Various drivers attempt token lookups on non-RTAS + * platforms. + */ + if (!rtas.dev) + return RTAS_UNKNOWN_SERVICE; + + return rtas_function_table[index].token; +} +EXPORT_SYMBOL_GPL(rtas_function_token); + +static int rtas_function_cmp(const void *a, const void *b) +{ + const struct rtas_function *f1 = a; + const struct rtas_function *f2 = b; + + return strcmp(f1->name, f2->name); +} + +/* + * Boot-time initialization of the function table needs the lookup to + * return a non-const-qualified object. Use rtas_name_to_function() + * in all other contexts. + */ +static struct rtas_function *__rtas_name_to_function(const char *name) +{ + const struct rtas_function key = { + .name = name, + }; + struct rtas_function *found; + + found = bsearch(&key, rtas_function_table, ARRAY_SIZE(rtas_function_table), + sizeof(rtas_function_table[0]), rtas_function_cmp); + + return found; +} + +static const struct rtas_function *rtas_name_to_function(const char *name) +{ + return __rtas_name_to_function(name); +} + +static DEFINE_XARRAY(rtas_token_to_function_xarray); + +static int __init rtas_token_to_function_xarray_init(void) +{ + int err = 0; + + for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) { + const struct rtas_function *func = &rtas_function_table[i]; + const s32 token = func->token; + + if (token == RTAS_UNKNOWN_SERVICE) + continue; + + err = xa_err(xa_store(&rtas_token_to_function_xarray, + token, (void *)func, GFP_KERNEL)); + if (err) + break; + } + + return err; +} +arch_initcall(rtas_token_to_function_xarray_init); + +static const struct rtas_function *rtas_token_to_function(s32 token) +{ + const struct rtas_function *func; + + if (WARN_ONCE(token < 0, "invalid token %d", token)) + return NULL; + + func = xa_load(&rtas_token_to_function_xarray, token); + + if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token)) + return NULL; + + return func; +} + /* This is here deliberately so it's only used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static void __do_enter_rtas(struct rtas_args *args) +{ + enter_rtas(__pa(args)); + srr_regs_clobbered(); /* rtas uses SRRs, invalidate */ +} + +static void __do_enter_rtas_trace(struct rtas_args *args) { - unsigned long msr; + const char *name = NULL; + /* + * If the tracepoints that consume the function name aren't + * active, avoid the lookup. + */ + if ((trace_rtas_input_enabled() || trace_rtas_output_enabled())) { + const s32 token = be32_to_cpu(args->token); + const struct rtas_function *func = rtas_token_to_function(token); + + name = func->name; + } + + trace_rtas_input(args, name); + trace_rtas_ll_entry(args); + + __do_enter_rtas(args); + + trace_rtas_ll_exit(args); + trace_rtas_output(args, name); +} +static void do_enter_rtas(struct rtas_args *args) +{ + const unsigned long msr = mfmsr(); + /* + * Situations where we want to skip any active tracepoints for + * safety reasons: + * + * 1. 
The last code executed on an offline CPU as it stops, + * i.e. we're about to call stop-self. The tracepoints' + * function name lookup uses xarray, which uses RCU, which + * isn't valid to call on an offline CPU. Any events + * emitted on an offline CPU will be discarded anyway. + * + * 2. In real mode, as when invoking ibm,nmi-interlock from + * the pseries MCE handler. We cannot count on trace + * buffers or the entries in rtas_token_to_function_xarray + * to be contained in the RMO. + */ + const unsigned long mask = MSR_IR | MSR_DR; + const bool can_trace = likely(cpu_online(raw_smp_processor_id()) && + (msr & mask) == mask); /* * Make sure MSR[RI] is currently enabled as it will be forced later * in enter_rtas. */ - msr = mfmsr(); BUG_ON(!(msr & MSR_RI)); BUG_ON(!irqs_disabled()); hard_irq_disable(); /* Ensure MSR[EE] is disabled on PPC64 */ - enter_rtas(args); - - srr_regs_clobbered(); /* rtas uses SRRs, invalidate */ + if (can_trace) + __do_enter_rtas_trace(args); + else + __do_enter_rtas(args); } -struct rtas_t rtas = { - .lock = __ARCH_SPIN_LOCK_UNLOCKED -}; -EXPORT_SYMBOL(rtas); +struct rtas_t rtas; + +/* + * Nearly all RTAS calls need to be serialized. All uses of the + * default rtas_args block must hold rtas_lock. + * + * Exceptions to the RTAS serialization requirement (e.g. stop-self) + * must use a separate rtas_args structure. + */ +static DEFINE_RAW_SPINLOCK(rtas_lock); +static struct rtas_args rtas_args; DEFINE_SPINLOCK(rtas_data_buf_lock); -EXPORT_SYMBOL(rtas_data_buf_lock); +EXPORT_SYMBOL_GPL(rtas_data_buf_lock); -char rtas_data_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; -EXPORT_SYMBOL(rtas_data_buf); +char rtas_data_buf[RTAS_DATA_BUF_SIZE] __aligned(SZ_4K); +EXPORT_SYMBOL_GPL(rtas_data_buf); unsigned long rtas_rmo_buf; @@ -78,29 +642,7 @@ unsigned long rtas_rmo_buf; * This is done like this so rtas_flash can be a module. 
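Call sites follow a common pattern after the token rework: look the token up once by handle, then pass it to rtas_call(). A representative sketch, using the get-time-of-day parameters seen in rtas-rtc.c above:

	s32 token = rtas_function_token(RTAS_FN_GET_TIME_OF_DAY);
	int ret[8];

	if (token != RTAS_UNKNOWN_SERVICE)
		rtas_call(token, 0, 8, ret);	/* 0 inputs, 8 outputs */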
*/ void (*rtas_flash_term_hook)(int); -EXPORT_SYMBOL(rtas_flash_term_hook); - -/* RTAS use home made raw locking instead of spin_lock_irqsave - * because those can be called from within really nasty contexts - * such as having the timebase stopped which would lockup with - * normal locks and spinlock debugging enabled - */ -static unsigned long lock_rtas(void) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - arch_spin_lock(&rtas.lock); - return flags; -} - -static void unlock_rtas(unsigned long flags) -{ - arch_spin_unlock(&rtas.lock); - local_irq_restore(flags); - preempt_enable(); -} +EXPORT_SYMBOL_GPL(rtas_flash_term_hook); /* * call_rtas_display_status and call_rtas_display_status_delay @@ -109,14 +651,14 @@ static void unlock_rtas(unsigned long flags) */ static void call_rtas_display_status(unsigned char c) { - unsigned long s; + unsigned long flags; if (!rtas.base) return; - s = lock_rtas(); - rtas_call_unlocked(&rtas.args, 10, 1, 1, NULL, c); - unlock_rtas(s); + raw_spin_lock_irqsave(&rtas_lock, flags); + rtas_call_unlocked(&rtas_args, 10, 1, 1, NULL, c); + raw_spin_unlock_irqrestore(&rtas_lock, flags); } static void call_rtas_display_status_delay(char c) @@ -240,8 +782,8 @@ void rtas_progress(char *s, unsigned short hex) "ibm,display-truncation-length", NULL); of_node_put(root); } - display_character = rtas_token("display-character"); - set_indicator = rtas_token("set-indicator"); + display_character = rtas_function_token(RTAS_FN_DISPLAY_CHARACTER); + set_indicator = rtas_function_token(RTAS_FN_SET_INDICATOR); } if (display_character == RTAS_UNKNOWN_SERVICE) { @@ -326,23 +868,38 @@ void rtas_progress(char *s, unsigned short hex) spin_unlock(&progress_lock); } -EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */ +EXPORT_SYMBOL_GPL(rtas_progress); /* needed by rtas_flash module */ int rtas_token(const char *service) { + const struct rtas_function *func; const __be32 *tokp; + if (rtas.dev == NULL) return RTAS_UNKNOWN_SERVICE; + + func = rtas_name_to_function(service); + if (func) + return func->token; + /* + * The caller is looking up a name that is not known to be an + * RTAS function. Either it's a function that needs to be + * added to the table, or they're misusing rtas_token() to + * access non-function properties of the /rtas node. Warn and + * fall back to the legacy behavior. + */ + WARN_ONCE(1, "unknown function `%s`, should it be added to rtas_function_table?\n", + service); + tokp = of_get_property(rtas.dev, service, NULL); return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE; } -EXPORT_SYMBOL(rtas_token); +EXPORT_SYMBOL_GPL(rtas_token); int rtas_service_present(const char *service) { return rtas_token(service) != RTAS_UNKNOWN_SERVICE; } -EXPORT_SYMBOL(rtas_service_present); #ifdef CONFIG_RTAS_ERROR_LOGGING @@ -357,7 +914,6 @@ int rtas_get_error_log_max(void) { return rtas_error_log_max; } -EXPORT_SYMBOL(rtas_get_error_log_max); static void __init init_error_log_max(void) { @@ -381,39 +937,39 @@ static void __init init_error_log_max(void) static char rtas_err_buf[RTAS_ERROR_LOG_MAX]; -static int rtas_last_error_token; /** Return a copy of the detailed error text associated with the * most recent failed call to rtas. Because the error text * might go stale if there are any other intervening rtas calls, * this routine must be called atomically with whatever produced - * the error (i.e. with rtas.lock still held from the previous call). + * the error (i.e. with rtas_lock still held from the previous call). 
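The locking contract stated above, restated as a sketch: any caller that uses the shared rtas_args block takes rtas_lock around the unlocked primitive (token and arg here are placeholders):

	unsigned long flags;

	raw_spin_lock_irqsave(&rtas_lock, flags);
	rtas_call_unlocked(&rtas_args, token, 1, 1, NULL, arg);
	raw_spin_unlock_irqrestore(&rtas_lock, flags);

This mirrors call_rtas_display_status() above; paths that cannot serialize this way (e.g. stop-self) must supply their own rtas_args.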
*/ static char *__fetch_rtas_last_error(char *altbuf) { + const s32 token = rtas_function_token(RTAS_FN_RTAS_LAST_ERROR); struct rtas_args err_args, save_args; u32 bufsz; char *buf = NULL; - if (rtas_last_error_token == -1) + if (token == -1) return NULL; bufsz = rtas_get_error_log_max(); - err_args.token = cpu_to_be32(rtas_last_error_token); + err_args.token = cpu_to_be32(token); err_args.nargs = cpu_to_be32(2); err_args.nret = cpu_to_be32(1); err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf)); err_args.args[1] = cpu_to_be32(bufsz); err_args.args[2] = 0; - save_args = rtas.args; - rtas.args = err_args; + save_args = rtas_args; + rtas_args = err_args; - do_enter_rtas(__pa(&rtas.args)); + do_enter_rtas(&rtas_args); - err_args = rtas.args; - rtas.args = save_args; + err_args = rtas_args; + rtas_args = save_args; /* Log the error in the unlikely case that there was one. */ if (unlikely(err_args.args[2] == 0)) { @@ -457,7 +1013,7 @@ va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, for (i = 0; i < nret; ++i) args->rets[i] = 0; - do_enter_rtas(__pa(args)); + do_enter_rtas(args); } void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...) @@ -469,8 +1025,11 @@ void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_end(list); } -static int ibm_open_errinjct_token; -static int ibm_errinjct_token; +static bool token_is_restricted_errinjct(s32 token) +{ + return token == rtas_function_token(RTAS_FN_IBM_OPEN_ERRINJCT) || + token == rtas_function_token(RTAS_FN_IBM_ERRINJCT); +} /** * rtas_call() - Invoke an RTAS firmware function. @@ -481,7 +1040,7 @@ static int ibm_errinjct_token; * @....: List of @nargs input parameters. * * Invokes the RTAS function indicated by @token, which the caller - * should obtain via rtas_token(). + * should obtain via rtas_function_token(). * * The @nargs and @nret arguments must match the number of input and * output parameters specified for the RTAS function. @@ -534,15 +1093,15 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) { va_list list; int i; - unsigned long s; - struct rtas_args *rtas_args; + unsigned long flags; + struct rtas_args *args; char *buff_copy = NULL; int ret; if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE) return -1; - if (token == ibm_open_errinjct_token || token == ibm_errinjct_token) { + if (token_is_restricted_errinjct(token)) { /* * It would be nicer to not discard the error value * from security_locked_down(), but callers expect an @@ -557,26 +1116,25 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) return -1; } - s = lock_rtas(); - + raw_spin_lock_irqsave(&rtas_lock, flags); /* We use the global rtas args buffer */ - rtas_args = &rtas.args; + args = &rtas_args; va_start(list, outputs); - va_rtas_call_unlocked(rtas_args, token, nargs, nret, list); + va_rtas_call_unlocked(args, token, nargs, nret, list); va_end(list); /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (be32_to_cpu(rtas_args->rets[0]) == -1) + if (be32_to_cpu(args->rets[0]) == -1) buff_copy = __fetch_rtas_last_error(NULL); if (nret > 1 && outputs != NULL) for (i = 0; i < nret-1; ++i) - outputs[i] = be32_to_cpu(rtas_args->rets[i+1]); - ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0; + outputs[i] = be32_to_cpu(args->rets[i + 1]); + ret = (nret > 0) ? 
be32_to_cpu(args->rets[0]) : 0; - unlock_rtas(s); + raw_spin_unlock_irqrestore(&rtas_lock, flags); if (buff_copy) { log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0); @@ -585,7 +1143,7 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) } return ret; } -EXPORT_SYMBOL(rtas_call); +EXPORT_SYMBOL_GPL(rtas_call); /** * rtas_busy_delay_time() - From an RTAS status value, calculate the @@ -623,7 +1181,47 @@ unsigned int rtas_busy_delay_time(int status) return ms; } -EXPORT_SYMBOL(rtas_busy_delay_time); + +/* + * Early boot fallback for rtas_busy_delay(). + */ +static bool __init rtas_busy_delay_early(int status) +{ + static size_t successive_ext_delays __initdata; + bool retry; + + switch (status) { + case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX: + /* + * In the unlikely case that we receive an extended + * delay status in early boot, the OS is probably not + * the cause, and there's nothing we can do to clear + * the condition. Best we can do is delay for a bit + * and hope it's transient. Lie to the caller if it + * seems like we're stuck in a retry loop. + */ + mdelay(1); + retry = true; + successive_ext_delays += 1; + if (successive_ext_delays > 1000) { + pr_err("too many extended delays, giving up\n"); + dump_stack(); + retry = false; + successive_ext_delays = 0; + } + break; + case RTAS_BUSY: + retry = true; + successive_ext_delays = 0; + break; + default: + retry = false; + successive_ext_delays = 0; + break; + } + + return retry; +} /** * rtas_busy_delay() - helper for RTAS busy and extended delay statuses @@ -643,11 +1241,17 @@ EXPORT_SYMBOL(rtas_busy_delay_time); * * false - @status is not @RTAS_BUSY nor an extended delay hint. The * caller is responsible for handling @status. */ -bool rtas_busy_delay(int status) +bool __ref rtas_busy_delay(int status) { unsigned int ms; bool ret; + /* + * Can't do timed sleeps before timekeeping is up. 
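rtas_busy_delay() exists to serve the canonical retry loop, which appears verbatim in the rtas_flash.c hunks later in this patch; the early-boot fallback above lets the same loop run before timekeeping is ready (token and op are caller-supplied):

	int rc;

	do {
		rc = rtas_call(token, 1, 1, NULL, op);
	} while (rtas_busy_delay(rc));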
+ */ + if (system_state < SYSTEM_SCHEDULING) + return rtas_busy_delay_early(status); + switch (status) { case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX: ret = true; @@ -697,7 +1301,7 @@ bool rtas_busy_delay(int status) return ret; } -EXPORT_SYMBOL(rtas_busy_delay); +EXPORT_SYMBOL_GPL(rtas_busy_delay); static int rtas_error_rc(int rtas_rc) { @@ -729,7 +1333,7 @@ static int rtas_error_rc(int rtas_rc) int rtas_get_power_level(int powerdomain, int *level) { - int token = rtas_token("get-power-level"); + int token = rtas_function_token(RTAS_FN_GET_POWER_LEVEL); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -742,11 +1346,11 @@ int rtas_get_power_level(int powerdomain, int *level) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_get_power_level); +EXPORT_SYMBOL_GPL(rtas_get_power_level); int rtas_set_power_level(int powerdomain, int level, int *setlevel) { - int token = rtas_token("set-power-level"); + int token = rtas_function_token(RTAS_FN_SET_POWER_LEVEL); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -760,11 +1364,11 @@ int rtas_set_power_level(int powerdomain, int level, int *setlevel) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_set_power_level); +EXPORT_SYMBOL_GPL(rtas_set_power_level); int rtas_get_sensor(int sensor, int index, int *state) { - int token = rtas_token("get-sensor-state"); + int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -778,11 +1382,11 @@ int rtas_get_sensor(int sensor, int index, int *state) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_get_sensor); +EXPORT_SYMBOL_GPL(rtas_get_sensor); int rtas_get_sensor_fast(int sensor, int index, int *state) { - int token = rtas_token("get-sensor-state"); + int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -821,11 +1425,10 @@ bool rtas_indicator_present(int token, int *maxindex) return false; } -EXPORT_SYMBOL(rtas_indicator_present); int rtas_set_indicator(int indicator, int index, int new_value) { - int token = rtas_token("set-indicator"); + int token = rtas_function_token(RTAS_FN_SET_INDICATOR); int rc; if (token == RTAS_UNKNOWN_SERVICE) @@ -839,15 +1442,15 @@ int rtas_set_indicator(int indicator, int index, int new_value) return rtas_error_rc(rc); return rc; } -EXPORT_SYMBOL(rtas_set_indicator); +EXPORT_SYMBOL_GPL(rtas_set_indicator); /* * Ignoring RTAS extended delay */ int rtas_set_indicator_fast(int indicator, int index, int new_value) { + int token = rtas_function_token(RTAS_FN_SET_INDICATOR); int rc; - int token = rtas_token("set-indicator"); if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; @@ -889,10 +1492,11 @@ int rtas_set_indicator_fast(int indicator, int index, int new_value) */ int rtas_ibm_suspend_me(int *fw_status) { + int token = rtas_function_token(RTAS_FN_IBM_SUSPEND_ME); int fwrc; int ret; - fwrc = rtas_call(rtas_token("ibm,suspend-me"), 0, 1, NULL); + fwrc = rtas_call(token, 0, 1, NULL); switch (fwrc) { case 0: @@ -925,7 +1529,7 @@ void __noreturn rtas_restart(char *cmd) if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_RESTART); pr_emerg("system-reboot returned %d\n", - rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); + rtas_call(rtas_function_token(RTAS_FN_SYSTEM_REBOOT), 0, 1, NULL)); for (;;); } @@ -935,7 +1539,7 @@ void rtas_power_off(void) rtas_flash_term_hook(SYS_POWER_OFF); /* allow power on only with power button press */ pr_emerg("power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + 
rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1)); for (;;); } @@ -945,16 +1549,17 @@ void __noreturn rtas_halt(void) rtas_flash_term_hook(SYS_HALT); /* allow power on only with power button press */ pr_emerg("power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1)); for (;;); } /* Must be in the RMO region, so we place it here */ static char rtas_os_term_buf[2048]; -static s32 ibm_os_term_token = RTAS_UNKNOWN_SERVICE; +static bool ibm_extended_os_term; void rtas_os_term(char *str) { + s32 token = rtas_function_token(RTAS_FN_IBM_OS_TERM); int status; /* @@ -963,7 +1568,8 @@ void rtas_os_term(char *str) * this property may terminate the partition which we want to avoid * since it interferes with panic_timeout. */ - if (ibm_os_term_token == RTAS_UNKNOWN_SERVICE) + + if (token == RTAS_UNKNOWN_SERVICE || !ibm_extended_os_term) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); @@ -974,8 +1580,7 @@ void rtas_os_term(char *str) * schedules. */ do { - status = rtas_call(ibm_os_term_token, 1, 1, NULL, - __pa(rtas_os_term_buf)); + status = rtas_call(token, 1, 1, NULL, __pa(rtas_os_term_buf)); } while (rtas_busy_delay_time(status)); if (status != 0) @@ -995,10 +1600,9 @@ void rtas_os_term(char *str) */ void rtas_activate_firmware(void) { - int token; + int token = rtas_function_token(RTAS_FN_IBM_ACTIVATE_FIRMWARE); int fwrc; - token = rtas_token("ibm,activate-firmware"); if (token == RTAS_UNKNOWN_SERVICE) { pr_notice("ibm,activate-firmware method unavailable\n"); return; @@ -1063,56 +1667,12 @@ noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log * * Accordingly, we filter RTAS requests to check that the call is * permitted, and that provided pointers fall within the RMO buffer. - * The rtas_filters list contains an entry for each permitted call, - * with the indexes of the parameters which are expected to contain - * addresses and sizes of buffers allocated inside the RMO buffer. + * If a function is allowed to be invoked via the syscall, then its + * entry in the rtas_functions table points to a rtas_filter that + * describes its constraints, with the indexes of the parameters which + * are expected to contain addresses and sizes of buffers allocated + * inside the RMO buffer. 
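The bounds check itself, in_rmo_buf(), is only named in the hunk context below; a plausible reconstruction, assuming RTAS_USER_REGION_SIZE is the size of the userspace-visible window at rtas_rmo_buf (both the constant name and the exact comparisons are assumptions here):

	static bool in_rmo_buf_sketch(u32 base, u32 end)
	{
		return base >= rtas_rmo_buf &&
		       base <= end &&
		       end < (rtas_rmo_buf + RTAS_USER_REGION_SIZE);
	}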
*/ -struct rtas_filter { - const char *name; - int token; - /* Indexes into the args buffer, -1 if not used */ - int buf_idx1; - int size_idx1; - int buf_idx2; - int size_idx2; - - int fixed_size; -}; - -static struct rtas_filter rtas_filters[] __ro_after_init = { - { "ibm,activate-firmware", -1, -1, -1, -1, -1 }, - { "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */ - { "display-character", -1, -1, -1, -1, -1 }, - { "ibm,display-message", -1, 0, -1, -1, -1 }, - { "ibm,errinjct", -1, 2, -1, -1, -1, 1024 }, - { "ibm,close-errinjct", -1, -1, -1, -1, -1 }, - { "ibm,open-errinjct", -1, -1, -1, -1, -1 }, - { "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 }, - { "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 }, - { "ibm,get-indices", -1, 2, 3, -1, -1 }, - { "get-power-level", -1, -1, -1, -1, -1 }, - { "get-sensor-state", -1, -1, -1, -1, -1 }, - { "ibm,get-system-parameter", -1, 1, 2, -1, -1 }, - { "get-time-of-day", -1, -1, -1, -1, -1 }, - { "ibm,get-vpd", -1, 0, -1, 1, 2 }, - { "ibm,lpar-perftools", -1, 2, 3, -1, -1 }, - { "ibm,platform-dump", -1, 4, 5, -1, -1 }, /* Special cased */ - { "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 }, - { "ibm,scan-log-dump", -1, 0, 1, -1, -1 }, - { "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 }, - { "ibm,set-eeh-option", -1, -1, -1, -1, -1 }, - { "set-indicator", -1, -1, -1, -1, -1 }, - { "set-power-level", -1, -1, -1, -1, -1 }, - { "set-time-for-power-on", -1, -1, -1, -1, -1 }, - { "ibm,set-system-parameter", -1, 1, -1, -1, -1 }, - { "set-time-of-day", -1, -1, -1, -1, -1 }, -#ifdef CONFIG_CPU_BIG_ENDIAN - { "ibm,suspend-me", -1, -1, -1, -1, -1 }, - { "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 }, - { "ibm,update-properties", -1, 0, -1, -1, -1, 4096 }, -#endif - { "ibm,physical-attestation", -1, 0, 1, -1, -1 }, -}; static bool in_rmo_buf(u32 base, u32 end) { @@ -1126,63 +1686,75 @@ static bool in_rmo_buf(u32 base, u32 end) static bool block_rtas_call(int token, int nargs, struct rtas_args *args) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { - struct rtas_filter *f = &rtas_filters[i]; - u32 base, size, end; + const struct rtas_function *func; + const struct rtas_filter *f; + const bool is_platform_dump = token == rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP); + const bool is_config_conn = token == rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR); + u32 base, size, end; - if (token != f->token) - continue; - - if (f->buf_idx1 != -1) { - base = be32_to_cpu(args->args[f->buf_idx1]); - if (f->size_idx1 != -1) - size = be32_to_cpu(args->args[f->size_idx1]); - else if (f->fixed_size) - size = f->fixed_size; - else - size = 1; - - end = base + size - 1; + /* + * If this token doesn't correspond to a function the kernel + * understands, you're not allowed to call it. + */ + func = rtas_token_to_function(token); + if (!func) + goto err; + /* + * And only functions with filters attached are allowed. + */ + f = func->filter; + if (!f) + goto err; + /* + * And some functions aren't allowed on LE. 
+ */ + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) && func->banned_for_syscall_on_le) + goto err; + + if (f->buf_idx1 != -1) { + base = be32_to_cpu(args->args[f->buf_idx1]); + if (f->size_idx1 != -1) + size = be32_to_cpu(args->args[f->size_idx1]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; - /* - * Special case for ibm,platform-dump - NULL buffer - * address is used to indicate end of dump processing - */ - if (!strcmp(f->name, "ibm,platform-dump") && - base == 0) - return false; + end = base + size - 1; - if (!in_rmo_buf(base, end)) - goto err; - } + /* + * Special case for ibm,platform-dump - NULL buffer + * address is used to indicate end of dump processing + */ + if (is_platform_dump && base == 0) + return false; - if (f->buf_idx2 != -1) { - base = be32_to_cpu(args->args[f->buf_idx2]); - if (f->size_idx2 != -1) - size = be32_to_cpu(args->args[f->size_idx2]); - else if (f->fixed_size) - size = f->fixed_size; - else - size = 1; - end = base + size - 1; + if (!in_rmo_buf(base, end)) + goto err; + } - /* - * Special case for ibm,configure-connector where the - * address can be 0 - */ - if (!strcmp(f->name, "ibm,configure-connector") && - base == 0) - return false; + if (f->buf_idx2 != -1) { + base = be32_to_cpu(args->args[f->buf_idx2]); + if (f->size_idx2 != -1) + size = be32_to_cpu(args->args[f->size_idx2]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + end = base + size - 1; - if (!in_rmo_buf(base, end)) - goto err; - } + /* + * Special case for ibm,configure-connector where the + * address can be 0 + */ + if (is_config_conn && base == 0) + return false; - return false; + if (!in_rmo_buf(base, end)) + goto err; } + return false; err: pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n"); pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n", @@ -1190,14 +1762,6 @@ err: return true; } -static void __init rtas_syscall_filter_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) - rtas_filters[i].token = rtas_token(rtas_filters[i].name); -} - /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { @@ -1238,7 +1802,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) if (block_rtas_call(token, nargs, &args)) return -EINVAL; - if (token == ibm_open_errinjct_token || token == ibm_errinjct_token) { + if (token_is_restricted_errinjct(token)) { int err; err = security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION); @@ -1247,7 +1811,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) } /* Need to handle ibm,suspend_me call specially */ - if (token == rtas_token("ibm,suspend-me")) { + if (token == rtas_function_token(RTAS_FN_IBM_SUSPEND_ME)) { /* * rtas_ibm_suspend_me assumes the streamid handle is in cpu @@ -1268,18 +1832,18 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) buff_copy = get_errorlog_buffer(); - flags = lock_rtas(); + raw_spin_lock_irqsave(&rtas_lock, flags); - rtas.args = args; - do_enter_rtas(__pa(&rtas.args)); - args = rtas.args; + rtas_args = args; + do_enter_rtas(&rtas_args); + args = rtas_args; /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. 
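Setting the per-buffer checks aside, the admission logic of block_rtas_call() above reduces to a three-step predicate over the new descriptor fields (sketch; the helper name is hypothetical, the field names are real):

	static bool sys_rtas_call_allowed(const struct rtas_function *func)
	{
		if (!func)		/* token unknown to the kernel */
			return false;
		if (!func->filter)	/* no filter: not syscall-callable */
			return false;
		if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) &&
		    func->banned_for_syscall_on_le)
			return false;
		return true;		/* RMO bounds still checked per buffer */
	}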
*/ if (be32_to_cpu(args.rets[0]) == -1) errbuf = __fetch_rtas_last_error(buff_copy); - unlock_rtas(flags); + raw_spin_unlock_irqrestore(&rtas_lock, flags); if (buff_copy) { if (errbuf) @@ -1297,6 +1861,54 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) return 0; } +static void __init rtas_function_table_init(void) +{ + struct property *prop; + + for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) { + struct rtas_function *curr = &rtas_function_table[i]; + struct rtas_function *prior; + int cmp; + + curr->token = RTAS_UNKNOWN_SERVICE; + + if (i == 0) + continue; + /* + * Ensure table is sorted correctly for binary search + * on function names. + */ + prior = &rtas_function_table[i - 1]; + + cmp = strcmp(prior->name, curr->name); + if (cmp < 0) + continue; + + if (cmp == 0) { + pr_err("'%s' has duplicate function table entries\n", + curr->name); + } else { + pr_err("function table unsorted: '%s' wrongly precedes '%s'\n", + prior->name, curr->name); + } + } + + for_each_property_of_node(rtas.dev, prop) { + struct rtas_function *func; + + if (prop->length != sizeof(u32)) + continue; + + func = __rtas_name_to_function(prop->name); + if (!func) + continue; + + func->token = be32_to_cpup((__be32 *)prop->value); + + pr_debug("function %s has token %u\n", func->name, func->token); + } +} + /* * Call early during boot, before mem init, to retrieve the RTAS * information from the device-tree and allocate the RMO buffer for userland @@ -1330,12 +1942,14 @@ void __init rtas_initialize(void) init_error_log_max(); + /* Must be called before any function token lookups */ + rtas_function_table_init(); + /* - * Discover these now to avoid device tree lookups in the + * Discover this now to avoid a device tree lookup in the * panic path. */ - if (of_property_read_bool(rtas.dev, "ibm,extended-os-term")) - ibm_os_term_token = rtas_token("ibm,os-term"); + ibm_extended_os_term = of_property_read_bool(rtas.dev, "ibm,extended-os-term"); /* If RTAS was found, allocate the RMO buffer for it and look for * the stop-self token if any @@ -1350,12 +1964,7 @@ void __init rtas_initialize(void) panic("ERROR: RTAS: Failed to allocate %lx bytes below %pa\n", PAGE_SIZE, &rtas_region); -#ifdef CONFIG_RTAS_ERROR_LOGGING - rtas_last_error_token = rtas_token("rtas-last-error"); -#endif - ibm_open_errinjct_token = rtas_token("ibm,open-errinjct"); - ibm_errinjct_token = rtas_token("ibm,errinjct"); - rtas_syscall_filter_init(); + rtas_work_area_reserve_arena(rtas_region); } int __init early_init_dt_scan_rtas(unsigned long node, @@ -1401,23 +2010,22 @@ int __init early_init_dt_scan_rtas(unsigned long node, return 1; } -static arch_spinlock_t timebase_lock; +static DEFINE_RAW_SPINLOCK(timebase_lock); static u64 timebase = 0; void rtas_give_timebase(void) { unsigned long flags; - local_irq_save(flags); + raw_spin_lock_irqsave(&timebase_lock, flags); hard_irq_disable(); - arch_spin_lock(&timebase_lock); - rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL); + rtas_call(rtas_function_token(RTAS_FN_FREEZE_TIME_BASE), 0, 1, NULL); timebase = get_tb(); - arch_spin_unlock(&timebase_lock); + raw_spin_unlock(&timebase_lock); while (timebase) barrier(); - rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL); + rtas_call(rtas_function_token(RTAS_FN_THAW_TIME_BASE), 0, 1, NULL); local_irq_restore(flags); } @@ -1425,8 +2033,8 @@ void rtas_take_timebase(void) { while (!timebase) barrier(); - arch_spin_lock(&timebase_lock); + raw_spin_lock(&timebase_lock); set_tb(timebase >> 32, timebase & 0xffffffff); timebase = 0; - 
arch_spin_unlock(&timebase_lock); + raw_spin_unlock(&timebase_lock); } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index bc817a5619d6..4caf5e3079eb 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -376,7 +376,7 @@ static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op) s32 rc; do { - rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1, + rc = rtas_call(rtas_function_token(RTAS_FN_IBM_MANAGE_FLASH_IMAGE), 1, 1, NULL, op); } while (rtas_busy_delay(rc)); @@ -444,7 +444,7 @@ error: */ static void validate_flash(struct rtas_validate_flash_t *args_buf) { - int token = rtas_token("ibm,validate-flash-image"); + int token = rtas_function_token(RTAS_FN_IBM_VALIDATE_FLASH_IMAGE); int update_results; s32 rc; @@ -570,7 +570,7 @@ static void rtas_flash_firmware(int reboot_type) return; } - update_token = rtas_token("ibm,update-flash-64-and-reboot"); + update_token = rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT); if (update_token == RTAS_UNKNOWN_SERVICE) { printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot " "is not available -- not a service partition?\n"); @@ -653,7 +653,7 @@ static void rtas_flash_firmware(int reboot_type) */ struct rtas_flash_file { const char *filename; - const char *rtas_call_name; + const rtas_fn_handle_t handle; int *status; const struct proc_ops ops; }; @@ -661,7 +661,7 @@ struct rtas_flash_file { static const struct rtas_flash_file rtas_flash_files[] = { { .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME, - .rtas_call_name = "ibm,update-flash-64-and-reboot", + .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT, .status = &rtas_update_flash_data.status, .ops.proc_read = rtas_flash_read_msg, .ops.proc_write = rtas_flash_write, @@ -670,7 +670,7 @@ static const struct rtas_flash_file rtas_flash_files[] = { }, { .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME, - .rtas_call_name = "ibm,update-flash-64-and-reboot", + .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT, .status = &rtas_update_flash_data.status, .ops.proc_read = rtas_flash_read_num, .ops.proc_write = rtas_flash_write, @@ -679,7 +679,7 @@ static const struct rtas_flash_file rtas_flash_files[] = { }, { .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME, - .rtas_call_name = "ibm,validate-flash-image", + .handle = RTAS_FN_IBM_VALIDATE_FLASH_IMAGE, .status = &rtas_validate_flash_data.status, .ops.proc_read = validate_flash_read, .ops.proc_write = validate_flash_write, @@ -688,7 +688,7 @@ static const struct rtas_flash_file rtas_flash_files[] = { }, { .filename = "powerpc/rtas/" MANAGE_FLASH_NAME, - .rtas_call_name = "ibm,manage-flash-image", + .handle = RTAS_FN_IBM_MANAGE_FLASH_IMAGE, .status = &rtas_manage_flash_data.status, .ops.proc_read = manage_flash_read, .ops.proc_write = manage_flash_write, @@ -700,8 +700,7 @@ static int __init rtas_flash_init(void) { int i; - if (rtas_token("ibm,update-flash-64-and-reboot") == - RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT) == RTAS_UNKNOWN_SERVICE) { pr_info("rtas_flash: no firmware flash support\n"); return -EINVAL; } @@ -730,7 +729,7 @@ static int __init rtas_flash_init(void) * This code assumes that the status int is the first member of the * struct */ - token = rtas_token(f->rtas_call_name); + token = rtas_function_token(f->handle); if (token == RTAS_UNKNOWN_SERVICE) *f->status = FLASH_AUTH; else diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 5a2f5ea3b054..e1fdc7473b72 100644 --- 
a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -191,10 +191,10 @@ static void python_countermeasures(struct device_node *dev) void __init init_pci_config_tokens(void) { - read_pci_config = rtas_token("read-pci-config"); - write_pci_config = rtas_token("write-pci-config"); - ibm_read_pci_config = rtas_token("ibm,read-pci-config"); - ibm_write_pci_config = rtas_token("ibm,write-pci-config"); + read_pci_config = rtas_function_token(RTAS_FN_READ_PCI_CONFIG); + write_pci_config = rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG); + ibm_read_pci_config = rtas_function_token(RTAS_FN_IBM_READ_PCI_CONFIG); + ibm_write_pci_config = rtas_function_token(RTAS_FN_IBM_WRITE_PCI_CONFIG); } unsigned long get_phb_buid(struct device_node *phb) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index cc56ac6ba4b0..9bba469239fc 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -506,7 +506,7 @@ static int __init rtas_event_scan_init(void) return 0; /* No RTAS */ - event_scan = rtas_token("event-scan"); + event_scan = rtas_function_token(RTAS_FN_EVENT_SCAN); if (event_scan == RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "rtasd: No event-scan on system\n"); return -ENODEV; diff --git a/arch/powerpc/kernel/secvar-ops.c b/arch/powerpc/kernel/secvar-ops.c index 6a29777d6a2d..19172a2804f0 100644 --- a/arch/powerpc/kernel/secvar-ops.c +++ b/arch/powerpc/kernel/secvar-ops.c @@ -8,10 +8,16 @@ #include <linux/cache.h> #include <asm/secvar.h> +#include <asm/bug.h> -const struct secvar_operations *secvar_ops __ro_after_init; +const struct secvar_operations *secvar_ops __ro_after_init = NULL; -void set_secvar_ops(const struct secvar_operations *ops) +int set_secvar_ops(const struct secvar_operations *ops) { + if (WARN_ON_ONCE(secvar_ops)) + return -EBUSY; + secvar_ops = ops; + + return 0; } diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c index 1ee4640a2641..eb3c053f323f 100644 --- a/arch/powerpc/kernel/secvar-sysfs.c +++ b/arch/powerpc/kernel/secvar-sysfs.c @@ -21,56 +21,48 @@ static struct kset *secvar_kset; static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - ssize_t rc = 0; - struct device_node *node; - const char *format; - - node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); - if (!of_device_is_available(node)) { - rc = -ENODEV; - goto out; - } - - rc = of_property_read_string(node, "format", &format); - if (rc) - goto out; - - rc = sprintf(buf, "%s\n", format); + char tmp[32]; + ssize_t len = secvar_ops->format(tmp, sizeof(tmp)); -out: - of_node_put(node); + if (len > 0) + return sysfs_emit(buf, "%s\n", tmp); + else if (len < 0) + pr_err("Error %zd reading format string\n", len); + else + pr_err("Got empty format string from backend\n"); - return rc; + return -EIO; } static ssize_t size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - uint64_t dsize; + u64 dsize; int rc; rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize); if (rc) { - pr_err("Error retrieving %s variable size %d\n", kobj->name, - rc); + if (rc != -ENOENT) + pr_err("Error retrieving %s variable size %d\n", kobj->name, rc); return rc; } - return sprintf(buf, "%llu\n", dsize); + return sysfs_emit(buf, "%llu\n", dsize); } static ssize_t data_read(struct file *filep, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - uint64_t dsize; char *data; + u64 dsize; int rc; rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, 
NULL, &dsize); if (rc) { - pr_err("Error getting %s variable size %d\n", kobj->name, rc); + if (rc != -ENOENT) + pr_err("Error getting %s variable size %d\n", kobj->name, rc); return rc; } pr_debug("dsize is %llu\n", dsize); @@ -141,34 +133,58 @@ static struct kobj_type secvar_ktype = { static int update_kobj_size(void) { - struct device_node *node; u64 varsize; - int rc = 0; - - node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); - if (!of_device_is_available(node)) { - rc = -ENODEV; - goto out; - } + int rc = secvar_ops->max_size(&varsize); - rc = of_property_read_u64(node, "max-var-size", &varsize); if (rc) - goto out; + return rc; data_attr.size = varsize; update_attr.size = varsize; -out: - of_node_put(node); + return 0; +} - return rc; +static int secvar_sysfs_config(struct kobject *kobj) +{ + struct attribute_group config_group = { + .name = "config", + .attrs = (struct attribute **)secvar_ops->config_attrs, + }; + + if (secvar_ops->config_attrs) + return sysfs_create_group(kobj, &config_group); + + return 0; +} + +static int add_var(const char *name) +{ + struct kobject *kobj; + int rc; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (!kobj) + return -ENOMEM; + + kobject_init(kobj, &secvar_ktype); + + rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name); + if (rc) { + pr_warn("kobject_add error %d for attribute: %s\n", rc, + name); + kobject_put(kobj); + return rc; + } + + kobject_uevent(kobj, KOBJ_ADD); + return 0; } static int secvar_sysfs_load(void) { + u64 namesize = 0; char *name; - uint64_t namesize = 0; - struct kobject *kobj; int rc; name = kzalloc(NAME_MAX_SIZE, GFP_KERNEL); @@ -179,73 +195,99 @@ static int secvar_sysfs_load(void) rc = secvar_ops->get_next(name, &namesize, NAME_MAX_SIZE); if (rc) { if (rc != -ENOENT) - pr_err("error getting secvar from firmware %d\n", - rc); - break; - } + pr_err("error getting secvar from firmware %d\n", rc); + else + rc = 0; - kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); - if (!kobj) { - rc = -ENOMEM; break; } - kobject_init(kobj, &secvar_ktype); - - rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name); - if (rc) { - pr_warn("kobject_add error %d for attribute: %s\n", rc, - name); - kobject_put(kobj); - kobj = NULL; - } - - if (kobj) - kobject_uevent(kobj, KOBJ_ADD); - + rc = add_var(name); } while (!rc); kfree(name); return rc; } +static int secvar_sysfs_load_static(void) +{ + const char * const *name_ptr = secvar_ops->var_names; + int rc; + + while (*name_ptr) { + rc = add_var(*name_ptr); + if (rc) + return rc; + name_ptr++; + } + + return 0; +} + static int secvar_sysfs_init(void) { + u64 max_size; int rc; if (!secvar_ops) { - pr_warn("secvar: failed to retrieve secvar operations.\n"); + pr_warn("Failed to retrieve secvar operations\n"); return -ENODEV; } secvar_kobj = kobject_create_and_add("secvar", firmware_kobj); if (!secvar_kobj) { - pr_err("secvar: Failed to create firmware kobj\n"); + pr_err("Failed to create firmware kobj\n"); return -ENOMEM; } rc = sysfs_create_file(secvar_kobj, &format_attr.attr); if (rc) { - kobject_put(secvar_kobj); - return -ENOMEM; + pr_err("Failed to create format object\n"); + rc = -ENOMEM; + goto err; } secvar_kset = kset_create_and_add("vars", NULL, secvar_kobj); if (!secvar_kset) { - pr_err("secvar: sysfs kobject registration failed.\n"); - kobject_put(secvar_kobj); - return -ENOMEM; + pr_err("sysfs kobject registration failed\n"); + rc = -ENOMEM; + goto err; } rc = update_kobj_size(); if (rc) { pr_err("Cannot read the size of the attribute\n"); - return rc; + goto err; + } + + 
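Worth noting against the hunk above: -ENOENT from the backend is now treated as an expected miss rather than an error worth logging. A hedged sketch of the two-step secvar_ops->get() convention this sysfs code relies on (a NULL data pointer queries the size, a second call fetches the payload); the wrapper name is invented for illustration:

	#include <linux/errno.h>
	#include <linux/string.h>
	#include <asm/secvar.h>

	/* Hypothetical helper: read one secure variable into buf. */
	static int read_secvar(const char *name, void *buf, u64 bufsize)
	{
		u64 dsize;
		int rc;

		/* NULL data pointer: the backend only reports the size. */
		rc = secvar_ops->get(name, strlen(name) + 1, NULL, &dsize);
		if (rc)
			return rc;

		if (dsize > bufsize)
			return -EINVAL;

		/* Second call actually copies the variable out. */
		return secvar_ops->get(name, strlen(name) + 1, buf, &dsize);
	}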
rc = secvar_sysfs_config(secvar_kobj); + if (rc) { + pr_err("Failed to create config directory\n"); + goto err; } - secvar_sysfs_load(); + if (secvar_ops->get_next) + rc = secvar_sysfs_load(); + else + rc = secvar_sysfs_load_static(); + + if (rc) { + pr_err("Failed to create variable attributes\n"); + goto err; + } + + // Due to sysfs limitations, we will only ever get a write buffer of + // up to 1 page in size. Print a warning if this is potentially going + // to cause problems, so that the user is aware. + secvar_ops->max_size(&max_size); + if (max_size > PAGE_SIZE) + pr_warn_ratelimited("PAGE_SIZE (%lu) is smaller than maximum object size (%llu), writes are limited to PAGE_SIZE\n", + PAGE_SIZE, max_size); return 0; +err: + kobject_put(secvar_kobj); + return rc; } late_initcall(secvar_sysfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9b10e57040c6..e77734e5a127 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -87,6 +87,10 @@ EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); +#ifdef CONFIG_PPC64 +int boot_cpu_hwid = -1; +#endif + /* * These are used in binfmt_elf.c to put aux entries on the stack * for each elf executable being started. diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a0dee7354fe6..b2e0d3ce4261 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -385,17 +385,21 @@ void __init early_setup(unsigned long dt_ptr) /* * Do early initialization using the flattened device * tree, such as retrieving the physical memory map or - * calculating/retrieving the hash table size. + * calculating/retrieving the hash table size, discover + * boot_cpuid and boot_cpu_hwid. */ early_init_devtree(__va(dt_ptr)); - /* Now we know the logical id of our boot cpu, setup the paca. */ - if (boot_cpuid != 0) { - /* Poison paca_ptrs[0] again if it's not the boot cpu */ - memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0])); - } + allocate_paca_ptrs(); + allocate_paca(boot_cpuid); + set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid); fixup_boot_paca(paca_ptrs[boot_cpuid]); setup_paca(paca_ptrs[boot_cpuid]); /* install the paca into registers */ + // smp_processor_id() now reports boot_cpuid + +#ifdef CONFIG_SMP + task_thread_info(current)->cpu = boot_cpuid; // fix task_cpu(current) +#endif /* * Configure exception handlers. 
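The early_setup() hunk above reorders boot so that a non-zero boot CPU works: the device-tree scan now provides boot_cpuid and boot_cpu_hwid before any paca is allocated. A compressed sketch of the resulting ordering contract, using only calls named in the hunk, with elisions marked:

	void __init early_setup(unsigned long dt_ptr)
	{
		/* ... */
		early_init_devtree(__va(dt_ptr));   /* sets boot_cpuid, boot_cpu_hwid */

		allocate_paca_ptrs();               /* nr_cpu_ids is known by now */
		allocate_paca(boot_cpuid);
		set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid);
		fixup_boot_paca(paca_ptrs[boot_cpuid]);
		setup_paca(paca_ptrs[boot_cpuid]);  /* smp_processor_id() valid from here */

	#ifdef CONFIG_SMP
		/* keep task_cpu(current) in sync with smp_processor_id() */
		task_thread_info(current)->cpu = boot_cpuid;
	#endif
		/* ... */
	}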
This include setting up trampolines diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e26eb6618ae5..9d8665910350 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -356,7 +356,7 @@ void vtime_flush(struct task_struct *tsk) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -void __delay(unsigned long loops) +void __no_kcsan __delay(unsigned long loops) { unsigned long start; @@ -377,7 +377,7 @@ void __delay(unsigned long loops) } EXPORT_SYMBOL(__delay); -void udelay(unsigned long usecs) +void __no_kcsan udelay(unsigned long usecs) { __delay(tb_ticks_per_usec * usecs); } diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index af8527538fe4..b16a9f9c0b35 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -23,4 +23,5 @@ obj-$(CONFIG_PPC32) += $(obj32-y) # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_ftrace.o := n KCOV_INSTRUMENT_ftrace.o := n +KCSAN_SANITIZE_ftrace.o := n UBSAN_SANITIZE_ftrace.o := n diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 6a977b0d8ffc..66f723f53be2 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -16,6 +16,11 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday-32.o += -ffreestanding -fasynchronous-unwind-tables CFLAGS_REMOVE_vgettimeofday-32.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_vgettimeofday-32.o += -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc + # This flag is supported by clang for 64-bit but not 32-bit so it will cause + # an unused command line flag warning for this file. + ifdef CONFIG_CC_IS_CLANG + CFLAGS_REMOVE_vgettimeofday-32.o += -fno-stack-clash-protection + endif CFLAGS_vgettimeofday-64.o += -include $(c-gettimeofday-y) CFLAGS_vgettimeofday-64.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_vgettimeofday-64.o += $(call cc-option, -fno-stack-protector) @@ -46,15 +51,20 @@ GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n -ccflags-y := -shared -fno-common -fno-builtin -nostdlib -Wl,--hash-style=both -ccflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld) +ccflags-y := -fno-common -fno-builtin +ldflags-y := -Wl,--hash-style=both -nostdlib -shared -z noexecstack +ldflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld) +# Filter flags that clang will warn are unused for linking +ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CFLAGS)) -CC32FLAGS := -Wl,-soname=linux-vdso32.so.1 -m32 -AS32FLAGS := -D__VDSO32__ -s +CC32FLAGS := -m32 +LD32FLAGS := -Wl,-soname=linux-vdso32.so.1 +AS32FLAGS := -D__VDSO32__ -CC64FLAGS := -Wl,-soname=linux-vdso64.so.1 -AS64FLAGS := -D__VDSO64__ -s +LD64FLAGS := -Wl,-soname=linux-vdso64.so.1 +AS64FLAGS := -D__VDSO64__ targets += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -Upowerpc @@ -92,15 +102,15 @@ include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE # actual build commands quiet_cmd_vdso32ld_and_check = VDSO32L $@ - cmd_vdso32ld_and_check = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -z noexecstack ; $(cmd_vdso_check) + cmd_vdso32ld_and_check = $(VDSOCC) $(ldflags-y) $(CC32FLAGS) $(LD32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check) quiet_cmd_vdso32as = VDSO32A $@ cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) $(AS32FLAGS) -c -o $@ $< quiet_cmd_vdso32cc = VDSO32C $@ cmd_vdso32cc = 
$(VDSOCC) $(c_flags) $(CC32FLAGS) -c -o $@ $< quiet_cmd_vdso64ld_and_check = VDSO64L $@ - cmd_vdso64ld_and_check = $(VDSOCC) $(c_flags) $(CC64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -z noexecstack ; $(cmd_vdso_check) + cmd_vdso64ld_and_check = $(VDSOCC) $(ldflags-y) $(LD64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check) quiet_cmd_vdso64as = VDSO64A $@ - cmd_vdso64as = $(VDSOCC) $(a_flags) $(CC64FLAGS) $(AS64FLAGS) -c -o $@ $< + cmd_vdso64as = $(VDSOCC) $(a_flags) $(AS64FLAGS) -c -o $@ $< OBJECT_FILES_NON_STANDARD := y diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 9be3e818a240..110d28bede2a 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -28,6 +28,7 @@ #include <asm/crashdump-ppc64.h> #include <asm/mmzone.h> #include <asm/prom.h> +#include <asm/plpks.h> struct umem_info { u64 *buf; /* data buffer for usable-memory property */ @@ -689,7 +690,8 @@ static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem) ret = fdt_setprop(fdt, node, "linux,drconf-usable-memory", um_info.buf, (um_info.idx * sizeof(u64))); if (ret) { - pr_err("Failed to update fdt with linux,drconf-usable-memory property"); + pr_err("Failed to update fdt with linux,drconf-usable-memory property: %s", + fdt_strerror(ret)); goto out; } } @@ -978,12 +980,17 @@ static unsigned int cpu_node_size(void) */ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) { - unsigned int cpu_nodes, extra_size; + unsigned int cpu_nodes, extra_size = 0; struct device_node *dn; u64 usm_entries; + // Budget some space for the password blob. There's already extra space + // for the key name + if (plpks_is_available()) + extra_size += (unsigned int)plpks_get_passwordlen(); + if (image->type != KEXEC_TYPE_CRASH) - return 0; + return extra_size; /* * For kdump kernel, account for linux,usable-memory and @@ -993,9 +1000,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) if (drmem_lmb_size()) { usm_entries = ((memory_hotplug_max() / drmem_lmb_size()) + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); - extra_size = (unsigned int)(usm_entries * sizeof(u64)); - } else { - extra_size = 0; + extra_size += (unsigned int)(usm_entries * sizeof(u64)); } /* @@ -1234,6 +1239,10 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, } } + // If we have PLPKS active, we need to provide the password to the new kernel + if (plpks_is_available()) + ret = plpks_populate_fdt(fdt); + out: kfree(rmem); kfree(umem); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6d525285dbe8..57f4e7896d67 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -999,16 +999,6 @@ int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); -int kvmppc_core_check_processor_compat(void) -{ - /* - * We always return 0 for book3s. 
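The deletion above pairs with the e500 changes below: processor compatibility is no longer probed through generic kvm_arch hooks, each backend checks for itself before registering, and kvm_init() has lost its opaque first argument. A sketch of the resulting module-init shape, following the e500mc naming used in this series (the real init also sets up booke handlers and LPIDs, elided here):

	static int __init kvmppc_e500mc_init(void)
	{
		int r;

		/* the backend vets its own hardware up front now */
		r = kvmppc_e500mc_check_processor_compat();
		if (r)
			return r;

		/* ... booke handler and LPID setup elided ... */

		/* note: no NULL opaque argument any more */
		return kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
	}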
We check - * for compatibility while loading the HV - * or PR module - */ - return 0; -} - int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) { return kvm->arch.kvm_ops->hcall_implemented(hcall); @@ -1062,7 +1052,7 @@ static int kvmppc_book3s_init(void) { int r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) return r; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e89281d3ba28..01adffb24667 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1210,7 +1210,7 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr) /* * On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC, - * see kvmppc_core_check_processor_compat(). + * see kvmppc_e500mc_check_processor_compat(). */ #ifdef CONFIG_ALTIVEC case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL: diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index c8b2b4478545..b0f695428733 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -314,7 +314,7 @@ static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) kvmppc_booke_vcpu_put(vcpu); } -int kvmppc_core_check_processor_compat(void) +static int kvmppc_e500_check_processor_compat(void) { int r; @@ -507,7 +507,7 @@ static int __init kvmppc_e500_init(void) unsigned long handler_len; unsigned long max_ivor = 0; - r = kvmppc_core_check_processor_compat(); + r = kvmppc_e500_check_processor_compat(); if (r) goto err_out; @@ -531,7 +531,7 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 57e0ad6a2ca3..a309138927ff 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -168,7 +168,7 @@ static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) kvmppc_booke_vcpu_put(vcpu); } -int kvmppc_core_check_processor_compat(void) +int kvmppc_e500mc_check_processor_compat(void) { int r; @@ -388,6 +388,10 @@ static int __init kvmppc_e500mc_init(void) { int r; + r = kvmppc_e500mc_check_processor_compat(); + if (r) + goto err_out; + r = kvmppc_booke_init(); if (r) goto err_out; @@ -400,7 +404,7 @@ static int __init kvmppc_e500mc_init(void) */ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500mc.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 04494a4fb37a..4c5405fc5538 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -435,21 +435,6 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, } EXPORT_SYMBOL_GPL(kvmppc_ld); -int kvm_arch_hardware_enable(void) -{ - return 0; -} - -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return kvmppc_core_check_processor_compat(); -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; @@ -2544,11 +2529,6 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param) } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); -int kvm_arch_init(void 
*opaque) -{ - return 0; -} - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 4de71cbf6e8e..c4db459d304a 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -16,6 +16,8 @@ KASAN_SANITIZE_feature-fixups.o := n # restart_table.o contains functions called in the NMI interrupt path # which can be in real mode. Disable KASAN. KASAN_SANITIZE_restart_table.o := n +KCSAN_SANITIZE_code-patching.o := n +KCSAN_SANITIZE_feature-fixups.o := n ifdef CONFIG_KASAN CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 44a35ed4f686..fedffe3ae136 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1051,7 +1051,8 @@ static void __init htab_initialize(void) static_branch_enable(&stress_hpt_key); // Too early to use nr_cpu_ids, so use NR_CPUS tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, - 0, 0, MEMBLOCK_ALLOC_ANYWHERE); + __alignof__(struct stress_hpt_struct), + 0, MEMBLOCK_ALLOC_ANYWHERE); memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); stress_hpt_struct = __va(tmp); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4e29b619578c..e50bc5fc7ddf 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -700,12 +700,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); @@ -713,12 +714,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP void radix__local_flush_all_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_all_mm); @@ -732,12 +734,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -945,7 +948,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -985,7 +988,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1024,7 +1027,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, enum tlb_flush_type type; pid = 
mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); @@ -1104,6 +1107,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +/* + * Doesn't appear to be used anywhere. Remove. + */ #define TLB_FLUSH_ALL -1UL /* @@ -1125,23 +1131,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid, flush_pwc = false; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1179,15 +1184,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } } } else { - bool hflush = false; + bool hflush; unsigned long hstart, hend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart < hend) - hflush = true; - } + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend; if (type == FLUSH_TYPE_LOCAL) { asm volatile("ptesync": : :"memory"); @@ -1302,7 +1304,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). 
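Every conversion in this file follows the same shape: read mm->context.id once, turn the silent no-context bail-out into a one-shot warning, and only then enter the preempt-disabled flush. Condensed from the first hunk above:

	void radix__local_flush_tlb_mm(struct mm_struct *mm)
	{
		unsigned long pid = mm->context.id;

		/* flushing an mm with no context is a caller bug now */
		if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
			return;

		preempt_disable();
		_tlbiel_pid(pid, RIC_FLUSH_TLB);
		preempt_enable();
	}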
*/ - if (tlb->fullmm || tlb->need_flush_all) { + if (tlb->fullmm) { __flush_all_mm(mm, true); } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) @@ -1325,25 +1327,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool fullmm = (end == TLB_FLUSH_ALL); bool flush_pid; enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; - fullmm = (end == TLB_FLUSH_ALL); + WARN_ON_ONCE(end == TLB_FLUSH_ALL); preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - type = flush_type_needed(mm, fullmm); + type = flush_type_needed(mm, false); if (type == FLUSH_TYPE_NONE) goto out; - if (fullmm) - flush_pid = true; - else if (type == FLUSH_TYPE_GLOBAL) + if (type == FLUSH_TYPE_GLOBAL) flush_pid = nr_pages > tlb_single_page_flush_ceiling; else flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; @@ -1406,7 +1405,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; /* 4k page size, just blow the world */ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bd9784f77f2e..c6dccb4f06dc 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -120,6 +120,7 @@ extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); void reloc_kernel_entry(void *fdt, int addr); +void relocate_init(u64 dt_ptr, phys_addr_t start); extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index c7d4b317a823..58c8d9849cb1 100644 --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -45,7 +45,9 @@ static inline void book3e_tlb_lock(void) if (!cpu_has_feature(CPU_FTR_SMT)) return; - asm volatile("1: lbarx %0, 0, %1;" + asm volatile(".machine push;" + ".machine e6500;" + "1: lbarx %0, 0, %1;" "cmpwi %0, 0;" "bne 2f;" "stbcx. 
%2, 0, %1;" @@ -56,6 +58,7 @@ static inline void book3e_tlb_lock(void) "bne 2b;" "b 1b;" "3:" + ".machine pop;" : "=&r" (tmp) : "r" (&paca->tcd_ptr->lock), "r" (token) : "memory"); diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 76cf456d7976..7e0b8fe1c279 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -351,7 +351,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) mfspr r15,SPRN_MAS2 isync - tlbilxva 0,r15 + PPC_TLBILX_VA(0,R15) isync mtspr SPRN_MAS6,r10 diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index a4f7880f959d..d767e39d5645 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -169,7 +169,7 @@ static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i) void bpf_jit_init_reg_mapping(struct codegen_context *ctx); int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func); int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, int pass); + u32 *addrs, int pass, bool extra_pass); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 43e634126514..e93aefcfb83f 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -23,74 +23,6 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned int size) memset32(area, BREAKPOINT_INSTRUCTION, size / 4); } -/* Fix updated addresses (for subprog calls, ldimm64, et al) during extra pass */ -static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, u32 *addrs) -{ - const struct bpf_insn *insn = fp->insnsi; - bool func_addr_fixed; - u64 func_addr; - u32 tmp_idx; - int i, j, ret; - - for (i = 0; i < fp->len; i++) { - /* - * During the extra pass, only the branch target addresses for - * the subprog calls need to be fixed. All other instructions - * can left untouched. - * - * The JITed image length does not change because we already - * ensure that the JITed instruction sequence for these calls - * are of fixed length by padding them with NOPs. - */ - if (insn[i].code == (BPF_JMP | BPF_CALL) && - insn[i].src_reg == BPF_PSEUDO_CALL) { - ret = bpf_jit_get_func_addr(fp, &insn[i], true, - &func_addr, - &func_addr_fixed); - if (ret < 0) - return ret; - - /* - * Save ctx->idx as this would currently point to the - * end of the JITed image and set it to the offset of - * the instruction sequence corresponding to the - * subprog call temporarily. - */ - tmp_idx = ctx->idx; - ctx->idx = addrs[i] / 4; - ret = bpf_jit_emit_func_call_rel(image, ctx, func_addr); - if (ret) - return ret; - - /* - * Restore ctx->idx here. This is safe as the length - * of the JITed sequence remains unchanged. 
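With this fixup helper gone, the extra pass simply re-runs full code generation, handing extra_pass down so bpf_jit_get_func_addr() can resolve final call targets. The one remaining subtlety is that the worst-case NOP padding for ldimm64 and calls is only emitted while sizing (image == NULL), since the real passes recompute every offset anyway. The padding idiom as it appears later in this series, five slots being the PPC_LI64 maximum:

	tmp_idx = ctx->idx;
	PPC_LI64(dst_reg, imm64);
	/* reserve the worst case while sizing so the image can only
	 * shrink, never grow, on the real generation passes */
	if (!image)
		for (j = ctx->idx - tmp_idx; j < 5; j++)
			EMIT(PPC_RAW_NOP());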
- */ - ctx->idx = tmp_idx; - } else if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) { - tmp_idx = ctx->idx; - ctx->idx = addrs[i] / 4; -#ifdef CONFIG_PPC32 - PPC_LI32(bpf_to_ppc(insn[i].dst_reg) - 1, (u32)insn[i + 1].imm); - PPC_LI32(bpf_to_ppc(insn[i].dst_reg), (u32)insn[i].imm); - for (j = ctx->idx - addrs[i] / 4; j < 4; j++) - EMIT(PPC_RAW_NOP()); -#else - func_addr = ((u64)(u32)insn[i].imm) | (((u64)(u32)insn[i + 1].imm) << 32); - PPC_LI64(bpf_to_ppc(insn[i].dst_reg), func_addr); - /* overwrite rest with nops */ - for (j = ctx->idx - addrs[i] / 4; j < 5; j++) - EMIT(PPC_RAW_NOP()); -#endif - ctx->idx = tmp_idx; - i++; - } - } - - return 0; -} - int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr) { if (!exit_addr || is_offset_in_branch_range(exit_addr - (ctx->idx * 4))) { @@ -185,7 +117,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.stack_size = round_up(fp->aux->stack_depth, 16); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) { /* We hit something illegal or unsupported. */ fp = org_fp; goto out_addrs; @@ -200,7 +132,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ if (cgctx.seen & SEEN_TAILCALL || !is_offset_in_branch_range((long)cgctx.idx * 4)) { cgctx.idx = 0; - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) { fp = org_fp; goto out_addrs; } @@ -234,29 +166,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); - if (extra_pass) { - /* - * Do not touch the prologue and epilogue as they will remain - * unchanged. Only fix the branch target address for subprog - * calls in the body, and ldimm64 instructions. - * - * This does not change the offsets and lengths of the subprog - * call instruction sequences and hence, the size of the JITed - * image as well. - */ - bpf_jit_fixup_addresses(fp, code_base, &cgctx, addrs); - - /* There is no need to perform the usual passes. */ - goto skip_codegen_passes; - } - /* Code generation passes 1-2 */ for (pass = 1; pass < 3; pass++) { /* Now build the prologue, body code & epilogue for real. 
*/ cgctx.idx = 0; cgctx.alt_exit_addr = 0; bpf_jit_build_prologue(code_base, &cgctx); - if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass)) { + if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass, extra_pass)) { bpf_jit_binary_free(bpf_hdr); fp = org_fp; goto out_addrs; @@ -268,7 +184,6 @@ skip_init_ctx: proglen - (cgctx.idx * 4), cgctx.seen); } -skip_codegen_passes: if (bpf_jit_enable > 1) /* * Note that we output the base address of the code_base diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index a379b0ce19ff..7f91ea064c08 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -79,6 +79,20 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) #define SEEN_NVREG_FULL_MASK 0x0003ffff /* Non volatile registers r14-r31 */ #define SEEN_NVREG_TEMP_MASK 0x00001e01 /* BPF_REG_5, BPF_REG_AX, TMP_REG */ +static inline bool bpf_has_stack_frame(struct codegen_context *ctx) +{ + /* + * We only need a stack frame if: + * - we call other functions (kernel helpers), or + * - we use non volatile registers, or + * - we use tail call counter + * - the bpf program uses its stack area + * The latter condition is deduced from the usage of BPF_REG_FP + */ + return ctx->seen & (SEEN_FUNC | SEEN_TAILCALL | SEEN_NVREG_FULL_MASK) || + bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP)); +} + void bpf_jit_realloc_regs(struct codegen_context *ctx) { unsigned int nvreg_mask; @@ -114,11 +128,15 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) int i; /* Initialize tail_call_cnt, to be skipped if we do tail calls. */ - EMIT(PPC_RAW_LI(_R4, 0)); + if (ctx->seen & SEEN_TAILCALL) + EMIT(PPC_RAW_LI(_R4, 0)); + else + EMIT(PPC_RAW_NOP()); #define BPF_TAILCALL_PROLOGUE_SIZE 4 - EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx))); + if (bpf_has_stack_frame(ctx)) + EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx))); if (ctx->seen & SEEN_TAILCALL) EMIT(PPC_RAW_STW(_R4, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); @@ -141,12 +159,6 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) if (bpf_is_seen_register(ctx, i)) EMIT(PPC_RAW_STW(i, _R1, bpf_jit_stack_offsetof(ctx, i))); - /* If needed retrieve arguments 9 and 10, ie 5th 64 bits arg.*/ - if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_5))) { - EMIT(PPC_RAW_LWZ(bpf_to_ppc(BPF_REG_5) - 1, _R1, BPF_PPC_STACKFRAME(ctx)) + 8); - EMIT(PPC_RAW_LWZ(bpf_to_ppc(BPF_REG_5), _R1, BPF_PPC_STACKFRAME(ctx)) + 12); - } - /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP))) { EMIT(PPC_RAW_LI(bpf_to_ppc(BPF_REG_FP) - 1, 0)); @@ -171,7 +183,8 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); /* Tear down our stack frame */ - EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx))); + if (bpf_has_stack_frame(ctx)) + EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx))); if (ctx->seen & SEEN_FUNC) EMIT(PPC_RAW_MTLR(_R0)); @@ -193,9 +206,6 @@ int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func if (image && rel < 0x2000000 && rel >= -0x2000000) { PPC_BL(func); - EMIT(PPC_RAW_NOP()); - EMIT(PPC_RAW_NOP()); - EMIT(PPC_RAW_NOP()); } else { /* Load function address into r0 */ EMIT(PPC_RAW_LIS(_R0, IMM_H(func))); @@ -269,7 +279,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o /* Assemble the body code between the prologue & 
epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, int pass) + u32 *addrs, int pass, bool extra_pass) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; @@ -280,10 +290,13 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * for (i = 0; i < flen; i++) { u32 code = insn[i].code; + u32 prevcode = i ? insn[i - 1].code : 0; u32 dst_reg = bpf_to_ppc(insn[i].dst_reg); u32 dst_reg_h = dst_reg - 1; u32 src_reg = bpf_to_ppc(insn[i].src_reg); u32 src_reg_h = src_reg - 1; + u32 src2_reg = dst_reg; + u32 src2_reg_h = dst_reg_h; u32 ax_reg = bpf_to_ppc(BPF_REG_AX); u32 tmp_reg = bpf_to_ppc(TMP_REG); u32 size = BPF_SIZE(code); @@ -296,6 +309,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * u32 tmp_idx; int j; + if (i && (BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_ALU) && + (BPF_CLASS(prevcode) == BPF_ALU64 || BPF_CLASS(prevcode) == BPF_ALU) && + BPF_OP(prevcode) == BPF_MOV && BPF_SRC(prevcode) == BPF_X && + insn[i - 1].dst_reg == insn[i].dst_reg && insn[i - 1].imm != 1) { + src2_reg = bpf_to_ppc(insn[i - 1].src_reg); + src2_reg_h = src2_reg - 1; + ctx->idx = addrs[i - 1] / 4; + } + /* * addrs[] maps a BPF bytecode address into a real offset from * the start of the body code. @@ -328,95 +350,111 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * Arithmetic operations: ADD/SUB/MUL/DIV/MOD/NEG */ case BPF_ALU | BPF_ADD | BPF_X: /* (u32) dst += (u32) src */ - EMIT(PPC_RAW_ADD(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_ADD(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_ADD | BPF_X: /* dst += src */ - EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_ADDE(dst_reg_h, dst_reg_h, src_reg_h)); + EMIT(PPC_RAW_ADDC(dst_reg, src2_reg, src_reg)); + EMIT(PPC_RAW_ADDE(dst_reg_h, src2_reg_h, src_reg_h)); break; case BPF_ALU | BPF_SUB | BPF_X: /* (u32) dst -= (u32) src */ - EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SUB(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_SUB | BPF_X: /* dst -= src */ - EMIT(PPC_RAW_SUBFC(dst_reg, src_reg, dst_reg)); - EMIT(PPC_RAW_SUBFE(dst_reg_h, src_reg_h, dst_reg_h)); + EMIT(PPC_RAW_SUBFC(dst_reg, src_reg, src2_reg)); + EMIT(PPC_RAW_SUBFE(dst_reg_h, src_reg_h, src2_reg_h)); break; case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ imm = -imm; fallthrough; case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */ - if (IMM_HA(imm) & 0xffff) - EMIT(PPC_RAW_ADDIS(dst_reg, dst_reg, IMM_HA(imm))); + if (!imm) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + } else if (IMM_HA(imm) & 0xffff) { + EMIT(PPC_RAW_ADDIS(dst_reg, src2_reg, IMM_HA(imm))); + src2_reg = dst_reg; + } if (IMM_L(imm)) - EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); + EMIT(PPC_RAW_ADDI(dst_reg, src2_reg, IMM_L(imm))); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */ imm = -imm; fallthrough; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */ - if (!imm) + if (!imm) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h)); break; - + } if (imm >= -32768 && imm < 32768) { - EMIT(PPC_RAW_ADDIC(dst_reg, dst_reg, imm)); + EMIT(PPC_RAW_ADDIC(dst_reg, src2_reg, imm)); } else { PPC_LI32(_R0, imm); - EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, _R0)); + EMIT(PPC_RAW_ADDC(dst_reg, src2_reg, _R0)); } if (imm >= 0 || (BPF_OP(code) == BPF_SUB && imm == 0x80000000)) - EMIT(PPC_RAW_ADDZE(dst_reg_h, dst_reg_h)); + EMIT(PPC_RAW_ADDZE(dst_reg_h, src2_reg_h)); else - 
EMIT(PPC_RAW_ADDME(dst_reg_h, dst_reg_h)); + EMIT(PPC_RAW_ADDME(dst_reg_h, src2_reg_h)); break; case BPF_ALU64 | BPF_MUL | BPF_X: /* dst *= src */ bpf_set_seen_register(ctx, tmp_reg); - EMIT(PPC_RAW_MULW(_R0, dst_reg, src_reg_h)); - EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, src_reg)); - EMIT(PPC_RAW_MULHWU(tmp_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_MULW(_R0, src2_reg, src_reg_h)); + EMIT(PPC_RAW_MULW(dst_reg_h, src2_reg_h, src_reg)); + EMIT(PPC_RAW_MULHWU(tmp_reg, src2_reg, src_reg)); + EMIT(PPC_RAW_MULW(dst_reg, src2_reg, src_reg)); EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0)); EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, tmp_reg)); break; case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */ - EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_MULW(dst_reg, src2_reg, src_reg)); break; case BPF_ALU | BPF_MUL | BPF_K: /* (u32) dst *= (u32) imm */ - if (imm >= -32768 && imm < 32768) { - EMIT(PPC_RAW_MULI(dst_reg, dst_reg, imm)); + if (imm == 1) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + } else if (imm == -1) { + EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0)); + } else if (is_power_of_2((u32)imm)) { + EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, ilog2(imm))); + } else if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_MULI(dst_reg, src2_reg, imm)); } else { PPC_LI32(_R0, imm); - EMIT(PPC_RAW_MULW(dst_reg, dst_reg, _R0)); + EMIT(PPC_RAW_MULW(dst_reg, src2_reg, _R0)); } break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst *= imm */ if (!imm) { PPC_LI32(dst_reg, 0); PPC_LI32(dst_reg_h, 0); - break; - } - if (imm == 1) - break; - if (imm == -1) { - EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); - EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); - break; + } else if (imm == 1) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h)); + } else if (imm == -1) { + EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h)); + } else if (imm > 0 && is_power_of_2(imm)) { + imm = ilog2(imm); + EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, imm, 0, 31 - imm)); + EMIT(PPC_RAW_RLWIMI(dst_reg_h, dst_reg, imm, 32 - imm, 31)); + EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, imm)); + } else { + bpf_set_seen_register(ctx, tmp_reg); + PPC_LI32(tmp_reg, imm); + EMIT(PPC_RAW_MULW(dst_reg_h, src2_reg_h, tmp_reg)); + if (imm < 0) + EMIT(PPC_RAW_SUB(dst_reg_h, dst_reg_h, src2_reg)); + EMIT(PPC_RAW_MULHWU(_R0, src2_reg, tmp_reg)); + EMIT(PPC_RAW_MULW(dst_reg, src2_reg, tmp_reg)); + EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0)); } - bpf_set_seen_register(ctx, tmp_reg); - PPC_LI32(tmp_reg, imm); - EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, tmp_reg)); - if (imm < 0) - EMIT(PPC_RAW_SUB(dst_reg_h, dst_reg_h, dst_reg)); - EMIT(PPC_RAW_MULHWU(_R0, dst_reg, tmp_reg)); - EMIT(PPC_RAW_MULW(dst_reg, dst_reg, tmp_reg)); - EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0)); break; case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ - EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, src_reg)); break; case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ - EMIT(PPC_RAW_DIVWU(_R0, dst_reg, src_reg)); + EMIT(PPC_RAW_DIVWU(_R0, src2_reg, src_reg)); EMIT(PPC_RAW_MULW(_R0, src_reg, _R0)); - EMIT(PPC_RAW_SUB(dst_reg, dst_reg, _R0)); + EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0)); break; case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */ return -EOPNOTSUPP; @@ -425,11 +463,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_ALU | BPF_DIV | BPF_K: /* (u32) dst /= (u32) imm */ if 
(!imm) return -EINVAL; - if (imm == 1) - break; - - PPC_LI32(_R0, imm); - EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, _R0)); + if (imm == 1) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + } else if (is_power_of_2((u32)imm)) { + EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, ilog2(imm))); + } else { + PPC_LI32(_R0, imm); + EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, _R0)); + } break; case BPF_ALU | BPF_MOD | BPF_K: /* (u32) dst %= (u32) imm */ if (!imm) @@ -438,16 +479,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * if (!is_power_of_2((u32)imm)) { bpf_set_seen_register(ctx, tmp_reg); PPC_LI32(tmp_reg, imm); - EMIT(PPC_RAW_DIVWU(_R0, dst_reg, tmp_reg)); + EMIT(PPC_RAW_DIVWU(_R0, src2_reg, tmp_reg)); EMIT(PPC_RAW_MULW(_R0, tmp_reg, _R0)); - EMIT(PPC_RAW_SUB(dst_reg, dst_reg, _R0)); - break; - } - if (imm == 1) + EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0)); + } else if (imm == 1) { EMIT(PPC_RAW_LI(dst_reg, 0)); - else - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2((u32)imm), 31)); - + } else { + imm = ilog2((u32)imm); + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - imm, 31)); + } break; case BPF_ALU64 | BPF_MOD | BPF_K: /* dst %= imm */ if (!imm) @@ -459,7 +499,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * if (imm == 1) EMIT(PPC_RAW_LI(dst_reg, 0)); else - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2(imm), 31)); + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - ilog2(imm), 31)); EMIT(PPC_RAW_LI(dst_reg_h, 0)); break; case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ @@ -469,34 +509,38 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * return -EOPNOTSUPP; if (imm < 0) { - EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); - EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); + EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h)); imm = -imm; + src2_reg = dst_reg; + } + if (imm == 1) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h)); + } else { + imm = ilog2(imm); + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, imm)); } - if (imm == 1) - break; - imm = ilog2(imm); - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); - EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); - EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm)); break; case BPF_ALU | BPF_NEG: /* (u32) dst = -dst */ - EMIT(PPC_RAW_NEG(dst_reg, dst_reg)); + EMIT(PPC_RAW_NEG(dst_reg, src2_reg)); break; case BPF_ALU64 | BPF_NEG: /* dst = -dst */ - EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); - EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); + EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h)); break; /* * Logical operations: AND/OR/XOR/[A]LSH/[A]RSH */ case BPF_ALU64 | BPF_AND | BPF_X: /* dst = dst & src */ - EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_AND(dst_reg_h, dst_reg_h, src_reg_h)); + EMIT(PPC_RAW_AND(dst_reg, src2_reg, src_reg)); + EMIT(PPC_RAW_AND(dst_reg_h, src2_reg_h, src_reg_h)); break; case BPF_ALU | BPF_AND | BPF_X: /* (u32) dst = dst & src */ - EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_AND(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */ if (imm >= 0) @@ -504,23 +548,23 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * fallthrough; case BPF_ALU | BPF_AND | BPF_K: /* (u32) dst = dst & imm */ 
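		/*
		 * Reading note for the cases above and below: src2_reg is
		 * the first source operand and normally aliases dst_reg.
		 * When the previous BPF instruction was a register MOV
		 * into the same dst_reg, the peephole at the top of the
		 * loop rewinds ctx->idx and substitutes the MOV's source,
		 * so a pair like
		 *
		 *	r0 = r1; r0 &= r2;
		 *
		 * folds into a single "and dst, src2, src" instead of a
		 * mr-plus-and sequence. (Summary of the src2_reg scheme
		 * this hunk introduces, not part of the patch.)
		 */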
if (!IMM_H(imm)) { - EMIT(PPC_RAW_ANDI(dst_reg, dst_reg, IMM_L(imm))); + EMIT(PPC_RAW_ANDI(dst_reg, src2_reg, IMM_L(imm))); } else if (!IMM_L(imm)) { - EMIT(PPC_RAW_ANDIS(dst_reg, dst_reg, IMM_H(imm))); + EMIT(PPC_RAW_ANDIS(dst_reg, src2_reg, IMM_H(imm))); } else if (imm == (((1 << fls(imm)) - 1) ^ ((1 << (ffs(i) - 1)) - 1))) { - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - fls(imm), 32 - ffs(imm))); } else { PPC_LI32(_R0, imm); - EMIT(PPC_RAW_AND(dst_reg, dst_reg, _R0)); + EMIT(PPC_RAW_AND(dst_reg, src2_reg, _R0)); } break; case BPF_ALU64 | BPF_OR | BPF_X: /* dst = dst | src */ - EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, src_reg_h)); + EMIT(PPC_RAW_OR(dst_reg, src2_reg, src_reg)); + EMIT(PPC_RAW_OR(dst_reg_h, src2_reg_h, src_reg_h)); break; case BPF_ALU | BPF_OR | BPF_X: /* dst = (u32) dst | (u32) src */ - EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_OR(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_OR | BPF_K:/* dst = dst | imm */ /* Sign-extended */ @@ -528,145 +572,154 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_LI(dst_reg_h, -1)); fallthrough; case BPF_ALU | BPF_OR | BPF_K:/* dst = (u32) dst | (u32) imm */ - if (IMM_L(imm)) - EMIT(PPC_RAW_ORI(dst_reg, dst_reg, IMM_L(imm))); + if (IMM_L(imm)) { + EMIT(PPC_RAW_ORI(dst_reg, src2_reg, IMM_L(imm))); + src2_reg = dst_reg; + } if (IMM_H(imm)) - EMIT(PPC_RAW_ORIS(dst_reg, dst_reg, IMM_H(imm))); + EMIT(PPC_RAW_ORIS(dst_reg, src2_reg, IMM_H(imm))); break; case BPF_ALU64 | BPF_XOR | BPF_X: /* dst ^= src */ if (dst_reg == src_reg) { EMIT(PPC_RAW_LI(dst_reg, 0)); EMIT(PPC_RAW_LI(dst_reg_h, 0)); } else { - EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_XOR(dst_reg_h, dst_reg_h, src_reg_h)); + EMIT(PPC_RAW_XOR(dst_reg, src2_reg, src_reg)); + EMIT(PPC_RAW_XOR(dst_reg_h, src2_reg_h, src_reg_h)); } break; case BPF_ALU | BPF_XOR | BPF_X: /* (u32) dst ^= src */ if (dst_reg == src_reg) EMIT(PPC_RAW_LI(dst_reg, 0)); else - EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_XOR(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst ^= imm */ if (imm < 0) - EMIT(PPC_RAW_NOR(dst_reg_h, dst_reg_h, dst_reg_h)); + EMIT(PPC_RAW_NOR(dst_reg_h, src2_reg_h, src2_reg_h)); fallthrough; case BPF_ALU | BPF_XOR | BPF_K: /* (u32) dst ^= (u32) imm */ - if (IMM_L(imm)) - EMIT(PPC_RAW_XORI(dst_reg, dst_reg, IMM_L(imm))); + if (IMM_L(imm)) { + EMIT(PPC_RAW_XORI(dst_reg, src2_reg, IMM_L(imm))); + src2_reg = dst_reg; + } if (IMM_H(imm)) - EMIT(PPC_RAW_XORIS(dst_reg, dst_reg, IMM_H(imm))); + EMIT(PPC_RAW_XORIS(dst_reg, src2_reg, IMM_H(imm))); break; case BPF_ALU | BPF_LSH | BPF_X: /* (u32) dst <<= (u32) src */ - EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SLW(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_LSH | BPF_X: /* dst <<= src; */ bpf_set_seen_register(ctx, tmp_reg); EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32)); - EMIT(PPC_RAW_SLW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_SLW(dst_reg_h, src2_reg_h, src_reg)); EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); - EMIT(PPC_RAW_SRW(_R0, dst_reg, _R0)); - EMIT(PPC_RAW_SLW(tmp_reg, dst_reg, tmp_reg)); + EMIT(PPC_RAW_SRW(_R0, src2_reg, _R0)); + EMIT(PPC_RAW_SLW(tmp_reg, src2_reg, tmp_reg)); EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, _R0)); - EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SLW(dst_reg, src2_reg, src_reg)); EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, tmp_reg)); break; case BPF_ALU | 
BPF_LSH | BPF_K: /* (u32) dst <<= (u32) imm */ - if (!imm) - break; - EMIT(PPC_RAW_SLWI(dst_reg, dst_reg, imm)); + if (imm) + EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, imm)); + else + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst <<= imm */ if (imm < 0) return -EINVAL; - if (!imm) - break; - if (imm < 32) { - EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, imm, 0, 31 - imm)); - EMIT(PPC_RAW_RLWIMI(dst_reg_h, dst_reg, imm, 32 - imm, 31)); - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, imm, 0, 31 - imm)); - break; - } - if (imm < 64) - EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg, imm, 0, 31 - imm)); - else + if (!imm) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + } else if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, imm, 0, 31 - imm)); + EMIT(PPC_RAW_RLWIMI(dst_reg_h, src2_reg, imm, 32 - imm, 31)); + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, imm, 0, 31 - imm)); + } else if (imm < 64) { + EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg, imm, 0, 31 - imm)); + EMIT(PPC_RAW_LI(dst_reg, 0)); + } else { EMIT(PPC_RAW_LI(dst_reg_h, 0)); - EMIT(PPC_RAW_LI(dst_reg, 0)); + EMIT(PPC_RAW_LI(dst_reg, 0)); + } break; case BPF_ALU | BPF_RSH | BPF_X: /* (u32) dst >>= (u32) src */ - EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_RSH | BPF_X: /* dst >>= src */ bpf_set_seen_register(ctx, tmp_reg); EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32)); - EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg)); EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); - EMIT(PPC_RAW_SLW(_R0, dst_reg_h, _R0)); + EMIT(PPC_RAW_SLW(_R0, src2_reg_h, _R0)); EMIT(PPC_RAW_SRW(tmp_reg, dst_reg_h, tmp_reg)); EMIT(PPC_RAW_OR(dst_reg, dst_reg, _R0)); - EMIT(PPC_RAW_SRW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_SRW(dst_reg_h, src2_reg_h, src_reg)); EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg)); break; case BPF_ALU | BPF_RSH | BPF_K: /* (u32) dst >>= (u32) imm */ - if (!imm) - break; - EMIT(PPC_RAW_SRWI(dst_reg, dst_reg, imm)); + if (imm) + EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, imm)); + else + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst >>= imm */ if (imm < 0) return -EINVAL; - if (!imm) - break; - if (imm < 32) { - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); - EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); - EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, 32 - imm, imm, 31)); - break; - } - if (imm < 64) - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg_h, 64 - imm, imm - 32, 31)); - else + if (!imm) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h)); + } else if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, 32 - imm, imm, 31)); + } else if (imm < 64) { + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg_h, 64 - imm, imm - 32, 31)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } else { EMIT(PPC_RAW_LI(dst_reg, 0)); - EMIT(PPC_RAW_LI(dst_reg_h, 0)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } break; case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */ - EMIT(PPC_RAW_SRAW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SRAW(dst_reg, src2_reg, src_reg)); break; case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */ bpf_set_seen_register(ctx, tmp_reg); EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32)); - EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); - EMIT(PPC_RAW_SLW(_R0, dst_reg_h, _R0)); + EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg)); 
+ EMIT(PPC_RAW_SLW(_R0, src2_reg_h, _R0)); EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); EMIT(PPC_RAW_OR(dst_reg, dst_reg, _R0)); EMIT(PPC_RAW_RLWINM(_R0, tmp_reg, 0, 26, 26)); - EMIT(PPC_RAW_SRAW(tmp_reg, dst_reg_h, tmp_reg)); - EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_SRAW(tmp_reg, src2_reg_h, tmp_reg)); + EMIT(PPC_RAW_SRAW(dst_reg_h, src2_reg_h, src_reg)); EMIT(PPC_RAW_SLW(tmp_reg, tmp_reg, _R0)); EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg)); break; case BPF_ALU | BPF_ARSH | BPF_K: /* (s32) dst >>= imm */ - if (!imm) - break; - EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg, imm)); + if (imm) + EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg, imm)); + else + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* (s64) dst >>= imm */ if (imm < 0) return -EINVAL; - if (!imm) - break; - if (imm < 32) { - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); - EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); - EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm)); - break; + if (!imm) { + EMIT(PPC_RAW_MR(dst_reg, src2_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h)); + } else if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, imm)); + } else if (imm < 64) { + EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg_h, imm - 32)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, 31)); + } else { + EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg_h, 31)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, 31)); } - if (imm < 64) - EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, imm - 32)); - else - EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, 31)); - EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, 31)); break; /* @@ -700,7 +753,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * switch (imm) { case 16: /* Copy 16 bits to upper part */ - EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg, 16, 0, 15)); + EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg, 16, 0, 15)); /* Rotate 8 bits right & mask */ EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 24, 16, 31)); break; @@ -710,23 +763,23 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * 2 bytes are already in their final position * -- byte 2 and 4 (of bytes 1, 2, 3 and 4) */ - EMIT(PPC_RAW_RLWINM(_R0, dst_reg, 8, 0, 31)); + EMIT(PPC_RAW_RLWINM(_R0, src2_reg, 8, 0, 31)); /* Rotate 24 bits and insert byte 1 */ - EMIT(PPC_RAW_RLWIMI(_R0, dst_reg, 24, 0, 7)); + EMIT(PPC_RAW_RLWIMI(_R0, src2_reg, 24, 0, 7)); /* Rotate 24 bits and insert byte 3 */ - EMIT(PPC_RAW_RLWIMI(_R0, dst_reg, 24, 16, 23)); + EMIT(PPC_RAW_RLWIMI(_R0, src2_reg, 24, 16, 23)); EMIT(PPC_RAW_MR(dst_reg, _R0)); break; case 64: bpf_set_seen_register(ctx, tmp_reg); - EMIT(PPC_RAW_RLWINM(tmp_reg, dst_reg, 8, 0, 31)); - EMIT(PPC_RAW_RLWINM(_R0, dst_reg_h, 8, 0, 31)); + EMIT(PPC_RAW_RLWINM(tmp_reg, src2_reg, 8, 0, 31)); + EMIT(PPC_RAW_RLWINM(_R0, src2_reg_h, 8, 0, 31)); /* Rotate 24 bits and insert byte 1 */ - EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 0, 7)); - EMIT(PPC_RAW_RLWIMI(_R0, dst_reg_h, 24, 0, 7)); + EMIT(PPC_RAW_RLWIMI(tmp_reg, src2_reg, 24, 0, 7)); + EMIT(PPC_RAW_RLWIMI(_R0, src2_reg_h, 24, 0, 7)); /* Rotate 24 bits and insert byte 3 */ - EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 16, 23)); - EMIT(PPC_RAW_RLWIMI(_R0, dst_reg_h, 24, 16, 23)); + EMIT(PPC_RAW_RLWIMI(tmp_reg, src2_reg, 24, 16, 23)); + EMIT(PPC_RAW_RLWIMI(_R0, src2_reg_h, 24, 16, 23)); EMIT(PPC_RAW_MR(dst_reg, _R0)); EMIT(PPC_RAW_MR(dst_reg_h, tmp_reg)); break; @@ -736,7 
+789,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * switch (imm) { case 16: /* zero-extend 16 bits into 32 bits */ - EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 16, 31)); + EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 16, 31)); break; case 32: case 64: @@ -960,8 +1013,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm); PPC_LI32(dst_reg, (u32)insn[i].imm); /* padding to allow full 4 instructions for later patching */ - for (j = ctx->idx - tmp_idx; j < 4; j++) - EMIT(PPC_RAW_NOP()); + if (!image) + for (j = ctx->idx - tmp_idx; j < 4; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; break; @@ -989,7 +1043,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - ret = bpf_jit_get_func_addr(fp, &insn[i], false, + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, &func_addr, &func_addr_fixed); if (ret < 0) return ret; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 29ee306d6302..8dd3cabaa83a 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -240,13 +240,14 @@ int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func * load the callee's address, but this may optimize the number of * instructions required based on the nature of the address. * - * Since we don't want the number of instructions emitted to change, + * Since we don't want the number of instructions emitted to increase, * we pad the optimized PPC_LI64() call with NOPs to guarantee that * we always have a five-instruction sequence, which is the maximum * that PPC_LI64() can emit. */ - for (i = ctx->idx - ctx_idx; i < 5; i++) - EMIT(PPC_RAW_NOP()); + if (!image) + for (i = ctx->idx - ctx_idx; i < 5; i++) + EMIT(PPC_RAW_NOP()); EMIT(PPC_RAW_MTCTR(_R12)); EMIT(PPC_RAW_BCTRL()); @@ -343,7 +344,7 @@ asm ( /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, int pass) + u32 *addrs, int pass, bool extra_pass) { enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; @@ -938,8 +939,9 @@ emit_clear: tmp_idx = ctx->idx; PPC_LI64(dst_reg, imm64); /* padding to allow full 5 instructions for later patching */ - for (j = ctx->idx - tmp_idx; j < 5; j++) - EMIT(PPC_RAW_NOP()); + if (!image) + for (j = ctx->idx - tmp_idx; j < 5; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; break; @@ -967,7 +969,7 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - ret = bpf_jit_get_func_addr(fp, &insn[i], false, + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, &func_addr, &func_addr_fixed); if (ret < 0) return ret; diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 33c23225fd54..317175791d23 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -18,6 +18,7 @@ #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/io.h> +#include <asm/papr-sysparm.h> #include <linux/byteorder/generic.h> #include <asm/rtas.h> @@ -66,8 +67,6 @@ static bool is_physical_domain(unsigned int domain) * Refer PAPR+ document to get parameter token value as '43'. 
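A note on the padding hunks in both JIT files above: the first code-generation pass runs with image == NULL purely to size the program and fix instruction addresses, so it must reserve the worst case at every variable-length site (ldimm64, function calls); the final extra pass regenerates the whole image with extra_pass = true and real target addresses, so it may emit fewer instructions and no longer needs trailing NOPs. A toy model of that sizing contract, assuming nothing beyond these hunks:

#include <stddef.h>
#include <stdint.h>

/* buf == NULL is the sizing pass; a later pass may emit fewer insns,
 * never more, than what the sizing pass reserved. */
static size_t emit_li64(uint32_t *buf, size_t idx, uint64_t imm)
{
	size_t n = (imm <= 0x7fff) ? 1 : 5;	/* short encoding available? */

	if (!buf)
		return idx + 5;			/* sizing pass: reserve the maximum */

	for (size_t j = 0; j < n; j++)
		buf[idx + j] = 0x60000000;	/* stand-in opcode */
	return idx + n;
}

Passing the real extra_pass flag to bpf_jit_get_func_addr() is the other half of the change: on the final pass the helper addresses are definitive, so the shorter sequences are actually correct.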
*/ -#define PROCESSOR_MODULE_INFO 43 - static u32 phys_sockets; /* Physical sockets */ static u32 phys_chipspersocket; /* Physical chips per socket*/ static u32 phys_coresperchip; /* Physical cores per chip */ @@ -79,9 +78,7 @@ static u32 phys_coresperchip; /* Physical cores per chip */ */ void read_24x7_sys_info(void) { - int call_status, len, ntypes; - - spin_lock(&rtas_data_buf_lock); + struct papr_sysparm_buf *buf; /* * Making system parameter: chips and sockets and cores per chip @@ -91,32 +88,22 @@ void read_24x7_sys_info(void) phys_chipspersocket = 1; phys_coresperchip = 1; - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - PROCESSOR_MODULE_INFO, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - - if (call_status != 0) { - pr_err("Error calling get-system-parameter %d\n", - call_status); - } else { - len = be16_to_cpup((__be16 *)&rtas_data_buf[0]); - if (len < 8) - goto out; - - ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]); + buf = papr_sysparm_buf_alloc(); + if (!buf) + return; - if (!ntypes) - goto out; + if (!papr_sysparm_get(PAPR_SYSPARM_PROC_MODULE_INFO, buf)) { + int ntypes = be16_to_cpup((__be16 *)&buf->val[0]); + int len = be16_to_cpu(buf->len); - phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]); - phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]); - phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]); + if (len >= 8 && ntypes != 0) { + phys_sockets = be16_to_cpup((__be16 *)&buf->val[2]); + phys_chipspersocket = be16_to_cpup((__be16 *)&buf->val[4]); + phys_coresperchip = be16_to_cpup((__be16 *)&buf->val[6]); + } } -out: - spin_unlock(&rtas_data_buf_lock); + papr_sysparm_buf_free(buf); } /* Domains for which more than one result element are returned for each event. */ @@ -1727,7 +1714,8 @@ static int hv_24x7_init(void) } /* POWER8 only supports v1, while POWER9 only supports v2. */ - if (PVR_VER(pvr) == PVR_POWER8) + if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || + PVR_VER(pvr) == PVR_POWER8NVL) interface_version = 1; else { interface_version = 2; diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index e2e4f6d8150d..56d91dbef577 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -205,7 +205,7 @@ static void __init node_irq_request(const char *compat, irq_handler_t errirq_han for_each_compatible_node(np, NULL, compat) { irq = irq_of_parse_and_map(np, 0); - if (irq == NO_IRQ) { + if (!irq) { pr_err("device tree node %pOFn is missing a interrupt", np); of_node_put(np); diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index e0647720ed5e..61dfec74ff85 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -41,7 +41,7 @@ static int rtas_read_config(struct pci_bus *bus, unsigned int devfn, int offset, int ret = -1; int rval; - rval = rtas_call(rtas_token("read-pci-config"), 2, 2, &ret, addr, len); + rval = rtas_call(rtas_function_token(RTAS_FN_READ_PCI_CONFIG), 2, 2, &ret, addr, len); *val = ret; return rval ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; } @@ -55,7 +55,7 @@ static int rtas_write_config(struct pci_bus *bus, unsigned int devfn, | (hose->global_number << 24); int rval; - rval = rtas_call(rtas_token("write-pci-config"), 3, 1, NULL, + rval = rtas_call(rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG), 3, 1, NULL, addr, len, val); return rval ? 
PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9563336e3348..046b571496b1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -118,19 +118,18 @@ endchoice choice prompt "CPU selection" - default GENERIC_CPU help This will create a kernel which is optimised for a particular CPU. The resulting kernel may not run on other CPUs, so use this with care. If unsure, select Generic. -config GENERIC_CPU +config POWERPC64_CPU bool "Generic (POWER5 and PowerPC 970 and above)" depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN select PPC_64S_HASH_MMU -config GENERIC_CPU +config POWERPC64_CPU bool "Generic (POWER8 and above)" depends on PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select ARCH_HAS_FAST_MULTIPLIER @@ -144,6 +143,7 @@ config POWERPC_CPU config CELL_CPU bool "Cell Broadband Engine" depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN + depends on !CC_IS_CLANG select PPC_64S_HASH_MMU config PPC_970_CPU @@ -188,11 +188,13 @@ config E5500_CPU config E6500_CPU bool "Freescale e6500" depends on PPC64 && PPC_E500 + depends on !CC_IS_CLANG select PPC_HAS_LBARX_LHARX config 405_CPU bool "40x family" depends on 40x + depends on !CC_IS_CLANG config 440_CPU bool "440 (44x family)" @@ -201,22 +203,27 @@ config 440_CPU config 464_CPU bool "464 (44x family)" depends on 44x + depends on !CC_IS_CLANG config 476_CPU bool "476 (47x family)" depends on PPC_47x + depends on !CC_IS_CLANG config 860_CPU bool "8xx family" depends on PPC_8xx + depends on !CC_IS_CLANG config E300C2_CPU bool "e300c2 (832x)" depends on PPC_BOOK3S_32 + depends on !CC_IS_CLANG config E300C3_CPU bool "e300c3 (831x)" depends on PPC_BOOK3S_32 + depends on !CC_IS_CLANG config G4_CPU bool "G4 (74xx)" @@ -233,13 +240,12 @@ config E500MC_CPU config TOOLCHAIN_DEFAULT_CPU bool "Rely on the toolchain's implicit default CPU" - depends on PPC32 endchoice config TARGET_CPU_BOOL bool - default !GENERIC_CPU && !TOOLCHAIN_DEFAULT_CPU + default !TOOLCHAIN_DEFAULT_CPU config TARGET_CPU string @@ -251,6 +257,10 @@ config TARGET_CPU default "power8" if POWER8_CPU default "power9" if POWER9_CPU default "power10" if POWER10_CPU + default "e5500" if E5500_CPU + default "e6500" if E6500_CPU + default "power4" if POWERPC64_CPU && !CPU_LITTLE_ENDIAN + default "power8" if POWERPC64_CPU && CPU_LITTLE_ENDIAN default "405" if 405_CPU default "440" if 440_CPU default "464" if 464_CPU diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 8d934ea6270c..98db63b72d56 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -297,8 +297,8 @@ int cbe_sysreset_hack(void) static int __init cbe_ptcal_init(void) { int ret; - ptcal_start_tok = rtas_token("ibm,cbe-start-ptcal"); - ptcal_stop_tok = rtas_token("ibm,cbe-stop-ptcal"); + ptcal_start_tok = rtas_function_token(RTAS_FN_IBM_CBE_START_PTCAL); + ptcal_stop_tok = rtas_function_token(RTAS_FN_IBM_CBE_STOP_PTCAL); if (ptcal_start_tok == RTAS_UNKNOWN_SERVICE || ptcal_stop_tok == RTAS_UNKNOWN_SERVICE) diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index 31ce00b52a32..30394c6f8894 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -81,7 +81,7 @@ static inline int smp_startup_cpu(unsigned int lcpu) * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. 
*/ - start_cpu = rtas_token("start-cpu"); + start_cpu = rtas_function_token(RTAS_FN_START_CPU); if (start_cpu == RTAS_UNKNOWN_SERVICE) return 1; @@ -152,7 +152,7 @@ void __init smp_init_cell(void) cpumask_clear_cpu(boot_cpuid, &of_spin_map); /* Non-lpar has additional take/give timebase */ - if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_FREEZE_TIME_BASE) != RTAS_UNKNOWN_SERVICE) { smp_ops->give_timebase = rtas_give_timebase; smp_ops->take_timebase = rtas_take_timebase; } diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c index dab78076fedb..0eedae96498c 100644 --- a/arch/powerpc/platforms/chrp/nvram.c +++ b/arch/powerpc/platforms/chrp/nvram.c @@ -31,7 +31,7 @@ static unsigned char chrp_nvram_read_val(int addr) return 0xff; } spin_lock_irqsave(&nvram_lock, flags); - if ((rtas_call(rtas_token("nvram-fetch"), 3, 2, &done, addr, + if ((rtas_call(rtas_function_token(RTAS_FN_NVRAM_FETCH), 3, 2, &done, addr, __pa(nvram_buf), 1) != 0) || 1 != done) ret = 0xff; else @@ -53,7 +53,7 @@ static void chrp_nvram_write_val(int addr, unsigned char val) } spin_lock_irqsave(&nvram_lock, flags); nvram_buf[0] = val; - if ((rtas_call(rtas_token("nvram-store"), 3, 2, &done, addr, + if ((rtas_call(rtas_function_token(RTAS_FN_NVRAM_STORE), 3, 2, &done, addr, __pa(nvram_buf), 1) != 0) || 1 != done) printk(KERN_DEBUG "rtas IO error storing 0x%02x at %d", val, addr); spin_unlock_irqrestore(&nvram_lock, flags); diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 6f6598e771ff..428fd2a7b3ee 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -104,7 +104,7 @@ static int rtas_read_config(struct pci_bus *bus, unsigned int devfn, int offset, int ret = -1; int rval; - rval = rtas_call(rtas_token("read-pci-config"), 2, 2, &ret, addr, len); + rval = rtas_call(rtas_function_token(RTAS_FN_READ_PCI_CONFIG), 2, 2, &ret, addr, len); *val = ret; return rval? PCIBIOS_DEVICE_NOT_FOUND: PCIBIOS_SUCCESSFUL; } @@ -118,7 +118,7 @@ static int rtas_write_config(struct pci_bus *bus, unsigned int devfn, int offset | (hose->global_number << 24); int rval; - rval = rtas_call(rtas_token("write-pci-config"), 3, 1, NULL, + rval = rtas_call(rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG), 3, 1, NULL, addr, len, val); return rval? 
PCIBIOS_DEVICE_NOT_FOUND: PCIBIOS_SUCCESSFUL; } diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index ec63c0558db6..d9049ceb1046 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -323,11 +323,11 @@ static void __init chrp_setup_arch(void) printk("chrp type = %x [%s]\n", _chrp_type, chrp_names[_chrp_type]); rtas_initialize(); - if (rtas_token("display-character") >= 0) + if (rtas_function_token(RTAS_FN_DISPLAY_CHARACTER) >= 0) ppc_md.progress = rtas_progress; /* use RTAS time-of-day routines if available */ - if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_GET_TIME_OF_DAY) != RTAS_UNKNOWN_SERVICE) { ppc_md.get_boot_time = rtas_get_boot_time; ppc_md.get_rtc_time = rtas_get_rtc_time; ppc_md.set_rtc_time = rtas_set_rtc_time; diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c index c26c379e1cc8..98c8e3603064 100644 --- a/arch/powerpc/platforms/maple/setup.c +++ b/arch/powerpc/platforms/maple/setup.c @@ -162,8 +162,8 @@ static struct smp_ops_t maple_smp_ops = { static void __init maple_use_rtas_reboot_and_halt_if_present(void) { - if (rtas_service_present("system-reboot") && - rtas_service_present("power-off")) { + if (rtas_function_implemented(RTAS_FN_SYSTEM_REBOOT) && + rtas_function_implemented(RTAS_FN_POWER_OFF)) { ppc_md.restart = rtas_restart; pm_power_off = rtas_power_off; ppc_md.halt = rtas_halt; diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c index 14133e120bdd..a8436bf35e2f 100644 --- a/arch/powerpc/platforms/powernv/opal-secvar.c +++ b/arch/powerpc/platforms/powernv/opal-secvar.c @@ -54,8 +54,7 @@ static int opal_status_to_err(int rc) return err; } -static int opal_get_variable(const char *key, uint64_t ksize, - u8 *data, uint64_t *dsize) +static int opal_get_variable(const char *key, u64 ksize, u8 *data, u64 *dsize) { int rc; @@ -71,8 +70,7 @@ static int opal_get_variable(const char *key, uint64_t ksize, return opal_status_to_err(rc); } -static int opal_get_next_variable(const char *key, uint64_t *keylen, - uint64_t keybufsize) +static int opal_get_next_variable(const char *key, u64 *keylen, u64 keybufsize) { int rc; @@ -88,8 +86,7 @@ static int opal_get_next_variable(const char *key, uint64_t *keylen, return opal_status_to_err(rc); } -static int opal_set_variable(const char *key, uint64_t ksize, u8 *data, - uint64_t dsize) +static int opal_set_variable(const char *key, u64 ksize, u8 *data, u64 dsize) { int rc; @@ -101,10 +98,57 @@ static int opal_set_variable(const char *key, uint64_t ksize, u8 *data, return opal_status_to_err(rc); } +static ssize_t opal_secvar_format(char *buf, size_t bufsize) +{ + ssize_t rc = 0; + struct device_node *node; + const char *format; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } + + rc = of_property_read_string(node, "format", &format); + if (rc) + goto out; + + rc = snprintf(buf, bufsize, "%s", format); + +out: + of_node_put(node); + + return rc; +} + +static int opal_secvar_max_size(u64 *max_size) +{ + int rc; + struct device_node *node; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!node) + return -ENODEV; + + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } + + rc = of_property_read_u64(node, "max-var-size", max_size); + +out: + of_node_put(node); + return rc; +} + static const struct 
secvar_operations opal_secvar_ops = { .get = opal_get_variable, .get_next = opal_get_next_variable, .set = opal_set_variable, + .format = opal_secvar_format, + .max_size = opal_secvar_max_size, }; static int opal_secvar_probe(struct platform_device *pdev) @@ -116,9 +160,7 @@ static int opal_secvar_probe(struct platform_device *pdev) return -ENODEV; } - set_secvar_ops(&opal_secvar_ops); - - return 0; + return set_secvar_ops(&opal_secvar_ops); } static const struct of_device_id opal_secvar_match[] = { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5c144c05cbfd..4f6e20a35aa1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2325,7 +2325,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, int index; int64_t rc; - if (!res || !res->flags || res->start > res->end) + if (!res || !res->flags || res->start > res->end || + res->flags & IORESOURCE_UNSET) return; if (res->flags & IORESOURCE_IO) { diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index c27e6cf85272..9de62bd52650 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -146,7 +146,7 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - panic("ps3_hpte_updateboltedpp() not implemented"); + pr_info("ps3_hpte_updateboltedpp() not implemented"); } static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index a3b4d99567cb..b481c5c8bae1 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -151,16 +151,16 @@ config IBMEBUS config PSERIES_PLPKS depends on PPC_PSERIES - bool "Support for the Platform Key Storage" - help - PowerVM provides an isolated Platform Keystore(PKS) storage - allocation for each LPAR with individually managed access - controls to store sensitive information securely. It can be - used to store asymmetric public keys or secrets as required - by different usecases. Select this config to enable - operating system interface to hypervisor to access this space. - - If unsure, select N. + select NLS + bool + # PowerVM provides an isolated Platform Keystore (PKS) storage + # allocation for each LPAR with individually managed access + # controls to store sensitive information securely. It can be + # used to store asymmetric public keys or secrets as required + # by different usecases. + # + # This option is selected by in-kernel consumers that require + # access to the PKS. 
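With format() and max_size() added to the ops table above, backend-specific knowledge (here, the ibm,secvar-backend device-tree node) stays in the driver and the secvar core can stay generic. A hedged sketch of a consumer, using the secvar_ops pointer registered via set_secvar_ops() (consumer shape assumed, not shown in this diff):

	char fmt[32];
	u64 max;

	if (secvar_ops->format(fmt, sizeof(fmt)) > 0)
		pr_info("secvar backend format: %s\n", fmt);

	if (!secvar_ops->max_size(&max))
		pr_info("largest writable object: %llu bytes\n", max);

Note also that set_secvar_ops() can now fail, which is why opal_secvar_probe() returns its result instead of unconditionally returning 0.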
config PAPR_SCM depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 92310202bdd7..53c3b91af2f7 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ - of_helpers.o \ + of_helpers.o rtas-work-area.o papr-sysparm.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ @@ -27,8 +27,8 @@ obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_SVM) += svm.o obj-$(CONFIG_FA_DUMP) += rtas-fadump.o -obj-$(CONFIG_PSERIES_PLPKS) += plpks.o - +obj-$(CONFIG_PSERIES_PLPKS) += plpks.o +obj-$(CONFIG_PPC_SECURE_BOOT) += plpks-secvar.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 498d6efcb5ae..75ffdbcd2865 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -22,6 +22,7 @@ #include <asm/machdep.h> #include <linux/uaccess.h> #include <asm/rtas.h> +#include <asm/rtas-work-area.h> static struct workqueue_struct *pseries_hp_wq; @@ -137,37 +138,27 @@ struct device_node *dlpar_configure_connector(__be32 drc_index, struct property *property; struct property *last_property = NULL; struct cc_workarea *ccwa; + struct rtas_work_area *work_area; char *data_buf; int cc_token; int rc = -1; - cc_token = rtas_token("ibm,configure-connector"); + cc_token = rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR); if (cc_token == RTAS_UNKNOWN_SERVICE) return NULL; - data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!data_buf) - return NULL; + work_area = rtas_work_area_alloc(SZ_4K); + data_buf = rtas_work_area_raw_buf(work_area); ccwa = (struct cc_workarea *)&data_buf[0]; ccwa->drc_index = drc_index; ccwa->zero = 0; do { - /* Since we release the rtas_data_buf lock between configure - * connector calls we want to re-populate the rtas_data_buffer - * with the contents of the previous call. 
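The dlpar conversion around this point is the template for the new RTAS work-area API used throughout this series: allocation may sleep but always succeeds (hence no NULL check in the converted code), the buffer satisfies RTAS addressing and alignment constraints, and no global lock is held across the call. The pattern in brief, mirroring the hunk here:

	struct rtas_work_area *work_area;
	int rc;

	work_area = rtas_work_area_alloc(SZ_4K);	/* may sleep; never NULL */

	do {
		rc = rtas_call(token, 2, 1, NULL,
			       rtas_work_area_phys(work_area), NULL);
	} while (rtas_busy_delay(rc));

	/* parse rtas_work_area_raw_buf(work_area) ... */

	rtas_work_area_free(work_area);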
- */ - spin_lock(&rtas_data_buf_lock); - - memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE); - rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); - memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); - - spin_unlock(&rtas_data_buf_lock); - - if (rtas_busy_delay(rc)) - continue; + do { + rc = rtas_call(cc_token, 2, 1, NULL, + rtas_work_area_phys(work_area), NULL); + } while (rtas_busy_delay(rc)); switch (rc) { case COMPLETE: @@ -227,7 +218,7 @@ struct device_node *dlpar_configure_connector(__be32 drc_index, } while (rc); cc_error: - kfree(data_buf); + rtas_work_area_free(work_area); if (rc) { if (first_dn) diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 6b507b62ce8f..def184da51cf 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -699,7 +699,7 @@ static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u static int pseries_send_allow_unfreeze(struct pci_dn *pdn, u16 *vf_pe_array, int cur_vfs) { int rc; - int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze"); + int ibm_allow_unfreeze = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE); unsigned long buid, addr; addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); @@ -774,7 +774,7 @@ static int pseries_notify_resume(struct eeh_dev *edev) if (!edev) return -EEXIST; - if (rtas_token("ibm,open-sriov-allow-unfreeze") == RTAS_UNKNOWN_SERVICE) + if (rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE) == RTAS_UNKNOWN_SERVICE) return -EINVAL; if (edev->pdev->is_physfn || edev->pdev->is_virtfn) @@ -815,14 +815,14 @@ static int __init eeh_pseries_init(void) int ret, config_addr; /* figure out EEH RTAS function call tokens */ - ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); - ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); - ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); - ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); - ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); - ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); - ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); - ibm_configure_pe = rtas_token("ibm,configure-pe"); + ibm_set_eeh_option = rtas_function_token(RTAS_FN_IBM_SET_EEH_OPTION); + ibm_set_slot_reset = rtas_function_token(RTAS_FN_IBM_SET_SLOT_RESET); + ibm_read_slot_reset_state2 = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE2); + ibm_read_slot_reset_state = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE); + ibm_slot_error_detail = rtas_function_token(RTAS_FN_IBM_SLOT_ERROR_DETAIL); + ibm_get_config_addr_info2 = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO2); + ibm_get_config_addr_info = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO); + ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_PE); /* * ibm,configure-pe and ibm,configure-bridge have the same semantics, @@ -830,7 +830,7 @@ static int __init eeh_pseries_init(void) * ibm,configure-pe then fall back to using ibm,configure-bridge. */ if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) - ibm_configure_pe = rtas_token("ibm,configure-bridge"); + ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_BRIDGE); /* * Necessary sanity check. 
We needn't check "get-config-addr-info" diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 090ae5a1e0f5..982e5e4b5e06 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -855,8 +855,8 @@ static int __init pseries_cpu_hotplug_init(void) ppc_md.cpu_release = dlpar_cpu_release; #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ - rtas_stop_self_token = rtas_token("stop-self"); - qcss_tok = rtas_token("query-cpu-stopped-state"); + rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF); + qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE || qcss_tok == RTAS_UNKNOWN_SERVICE) { diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c index 7b74d4d34e9a..f411d4fe7b24 100644 --- a/arch/powerpc/platforms/pseries/io_event_irq.c +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -143,7 +143,7 @@ static int __init ioei_init(void) { struct device_node *np; - ioei_check_exception_token = rtas_token("check-exception"); + ioei_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE) return -ENODEV; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 97ef6499e501..2eab323f6970 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -32,6 +32,7 @@ #include <asm/iommu.h> #include <asm/tlb.h> #include <asm/cputable.h> +#include <asm/papr-sysparm.h> #include <asm/udbg.h> #include <asm/smp.h> #include <asm/trace.h> @@ -1469,8 +1470,6 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp, } } -#define SPLPAR_TLB_BIC_TOKEN 50 - /* * The size of the TLB Block Invalidate Characteristics is variable. But at the * maximum it will be the number of possible page sizes *2 + 10 bytes. @@ -1481,42 +1480,24 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp, void __init pseries_lpar_read_hblkrm_characteristics(void) { - unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH]; - int call_status, len, idx, bpsize; + static struct papr_sysparm_buf buf __initdata; + int len, idx, bpsize; if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) return; - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - SPLPAR_TLB_BIC_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH); - local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0'; - spin_unlock(&rtas_data_buf_lock); - - if (call_status != 0) { - pr_warn("%s %s Error calling get-system-parameter (0x%x)\n", - __FILE__, __func__, call_status); + if (papr_sysparm_get(PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS, &buf)) return; - } - /* - * The first two (2) bytes of the data in the buffer are the length of - * the returned data, not counting these first two (2) bytes. 
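The deleted comment above captures the main ergonomic win of papr_sysparm_get(): the RTAS 2-byte length prefix is absorbed into the API, so buf.len is the payload length and buf.val[] starts at the data, and parsers begin at index 0 instead of 2. A compact model of the walk the converted code performs over the TLB Block Invalidate Characteristics (validity checks on block_shift elided):

	int len = be16_to_cpu(buf.len);		/* payload bytes, prefix excluded */
	int idx = 0;

	while (idx < len) {
		u8 block_shift = buf.val[idx++];
		u32 block_size = 1 << block_shift;

		for (u8 npsize = buf.val[idx++];
		     npsize > 0 && idx < len; npsize--)
			check_lp_set_hblkrm((unsigned int)buf.val[idx++],
					    block_size);
	}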
- */ - len = be16_to_cpu(*((u16 *)local_buffer)) + 2; + len = be16_to_cpu(buf.len); if (len > SPLPAR_TLB_BIC_MAXLENGTH) { pr_warn("%s too large returned buffer %d", __func__, len); return; } - idx = 2; + idx = 0; while (idx < len) { - u8 block_shift = local_buffer[idx++]; + u8 block_shift = buf.val[idx++]; u32 block_size; unsigned int npsize; @@ -1525,9 +1506,9 @@ void __init pseries_lpar_read_hblkrm_characteristics(void) block_size = 1 << block_shift; - for (npsize = local_buffer[idx++]; + for (npsize = buf.val[idx++]; npsize > 0 && idx < len; npsize--) - check_lp_set_hblkrm((unsigned int) local_buffer[idx++], + check_lp_set_hblkrm((unsigned int)buf.val[idx++], block_size); } diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 63fd925ccbb8..8acc70509520 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/proc_fs.h> #include <linux/init.h> +#include <asm/papr-sysparm.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -312,16 +313,6 @@ static void parse_mpp_x_data(struct seq_file *m) } /* - * PAPR defines, in section "7.3.16 System Parameters Option", the token 55 to - * read the LPAR name, and the largest output data to 4000 + 2 bytes length. - */ -#define SPLPAR_LPAR_NAME_TOKEN 55 -#define GET_SYS_PARM_BUF_SIZE 4002 -#if GET_SYS_PARM_BUF_SIZE > RTAS_DATA_BUF_SIZE -#error "GET_SYS_PARM_BUF_SIZE is larger than RTAS_DATA_BUF_SIZE" -#endif - -/* * Read the lpar name using the RTAS ibm,get-system-parameter call. * * The name read through this call is updated if changes are made by the end @@ -332,46 +323,19 @@ static void parse_mpp_x_data(struct seq_file *m) */ static int read_rtas_lpar_name(struct seq_file *m) { - int rc, len, token; - union { - char raw_buffer[GET_SYS_PARM_BUF_SIZE]; - struct { - __be16 len; - char name[GET_SYS_PARM_BUF_SIZE-2]; - }; - } *local_buffer; - - token = rtas_token("ibm,get-system-parameter"); - if (token == RTAS_UNKNOWN_SERVICE) - return -EINVAL; + struct papr_sysparm_buf *buf; + int err; - local_buffer = kmalloc(sizeof(*local_buffer), GFP_KERNEL); - if (!local_buffer) + buf = papr_sysparm_buf_alloc(); + if (!buf) return -ENOMEM; - do { - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, sizeof(*local_buffer)); - rc = rtas_call(token, 3, 1, NULL, SPLPAR_LPAR_NAME_TOKEN, - __pa(rtas_data_buf), sizeof(*local_buffer)); - if (!rc) - memcpy(local_buffer->raw_buffer, rtas_data_buf, - sizeof(local_buffer->raw_buffer)); - spin_unlock(&rtas_data_buf_lock); - } while (rtas_busy_delay(rc)); - - if (!rc) { - /* Force end of string */ - len = min((int) be16_to_cpu(local_buffer->len), - (int) sizeof(local_buffer->name)-1); - local_buffer->name[len] = '\0'; - - seq_printf(m, "partition_name=%s\n", local_buffer->name); - } else - rc = -ENODATA; + err = papr_sysparm_get(PAPR_SYSPARM_LPAR_NAME, buf); + if (!err) + seq_printf(m, "partition_name=%s\n", buf->val); - kfree(local_buffer); - return rc; + papr_sysparm_buf_free(buf); + return err; } /* @@ -397,7 +361,6 @@ static void read_lpar_name(struct seq_file *m) pr_err_once("Error can't get the LPAR name"); } -#define SPLPAR_CHARACTERISTICS_TOKEN 20 #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) /* @@ -408,45 +371,25 @@ static void read_lpar_name(struct seq_file *m) */ static void parse_system_parameter_string(struct seq_file *m) { - int call_status; + struct papr_sysparm_buf *buf; - unsigned char *local_buffer = 
kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!local_buffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); + buf = papr_sysparm_buf_alloc(); + if (!buf) return; - } - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - SPLPAR_CHARACTERISTICS_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); - local_buffer[SPLPAR_MAXLENGTH - 1] = '\0'; - spin_unlock(&rtas_data_buf_lock); - - if (call_status != 0) { - printk(KERN_INFO - "%s %s Error calling get-system-parameter (0x%x)\n", - __FILE__, __func__, call_status); + if (papr_sysparm_get(PAPR_SYSPARM_SHARED_PROC_LPAR_ATTRS, buf)) { + goto out_free; } else { + const char *local_buffer; int splpar_strlen; int idx, w_idx; char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!workbuffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); - kfree(local_buffer); - return; - } -#ifdef LPARCFG_DEBUG - printk(KERN_INFO "success calling get-system-parameter\n"); -#endif - splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; - local_buffer += 2; /* step over strlen value */ + + if (!workbuffer) + goto out_free; + + splpar_strlen = be16_to_cpu(buf->len); + local_buffer = buf->val; w_idx = 0; idx = 0; @@ -480,7 +423,8 @@ static void parse_system_parameter_string(struct seq_file *m) kfree(workbuffer); local_buffer -= 2; /* back up over strlen value */ } - kfree(local_buffer); +out_free: + papr_sysparm_buf_free(buf); } /* Return the number of processors in the system. diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 4cea71aa0f41..643d309d1bd0 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -195,7 +195,7 @@ static int update_dt_node(struct device_node *dn, s32 scope) u32 nprops; u32 vd; - update_properties_token = rtas_token("ibm,update-properties"); + update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES); if (update_properties_token == RTAS_UNKNOWN_SERVICE) return -EINVAL; @@ -306,7 +306,7 @@ static int pseries_devicetree_update(s32 scope) int update_nodes_token; int rc; - update_nodes_token = rtas_token("ibm,update-nodes"); + update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES); if (update_nodes_token == RTAS_UNKNOWN_SERVICE) return 0; diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 3f05507e444d..423ee1d5bd94 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -679,8 +679,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { - query_token = rtas_token("ibm,query-interrupt-source-number"); - change_token = rtas_token("ibm,change-msi"); + query_token = rtas_function_token(RTAS_FN_IBM_QUERY_INTERRUPT_SOURCE_NUMBER); + change_token = rtas_function_token(RTAS_FN_IBM_CHANGE_MSI); if ((query_token == RTAS_UNKNOWN_SERVICE) || (change_token == RTAS_UNKNOWN_SERVICE)) { diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index cbf1720eb4aa..8130c37962c0 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -227,8 +227,8 @@ int __init pSeries_nvram_init(void) nvram_size = be32_to_cpup(nbytes_p); - nvram_fetch = rtas_token("nvram-fetch"); - 
nvram_store = rtas_token("nvram-store"); + nvram_fetch = rtas_function_token(RTAS_FN_NVRAM_FETCH); + nvram_store = rtas_function_token(RTAS_FN_NVRAM_STORE); printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size); of_node_put(nvram); diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c b/arch/powerpc/platforms/pseries/papr-sysparm.c new file mode 100644 index 000000000000..fedc61599e6c --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-sysparm.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-sysparm: " fmt + +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <asm/rtas.h> +#include <asm/papr-sysparm.h> +#include <asm/rtas-work-area.h> + +struct papr_sysparm_buf *papr_sysparm_buf_alloc(void) +{ + struct papr_sysparm_buf *buf = kzalloc(sizeof(*buf), GFP_KERNEL); + + return buf; +} + +void papr_sysparm_buf_free(struct papr_sysparm_buf *buf) +{ + kfree(buf); +} + +/** + * papr_sysparm_get() - Retrieve the value of a PAPR system parameter. + * @param: PAPR system parameter token as described in + * 7.3.16 "System Parameters Option". + * @buf: A &struct papr_sysparm_buf as returned from papr_sysparm_buf_alloc(). + * + * Place the result of querying the specified parameter, if available, + * in @buf. The result includes a be16 length header followed by the + * value, which may be a string or binary data. See &struct papr_sysparm_buf. + * + * Since there is at least one parameter (60, OS Service Entitlement + * Status) where the results depend on the incoming contents of the + * work area, the caller-supplied buffer is copied unmodified into the + * work area before calling ibm,get-system-parameter. + * + * A defined parameter may not be implemented on a given system, and + * some implemented parameters may not be available to all partitions + * on a system. A parameter's disposition may change at any time due + * to system configuration changes or partition migration. + * + * Context: This function may sleep. + * + * Return: 0 on success, -errno otherwise. @buf is unmodified on error. + */ + +int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_GET_SYSTEM_PARAMETER); + struct rtas_work_area *work_area; + s32 fwrc; + int ret; + + might_sleep(); + + if (WARN_ON(!buf)) + return -EFAULT; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + work_area = rtas_work_area_alloc(sizeof(*buf)); + + memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); + + do { + fwrc = rtas_call(token, 3, 1, NULL, param.token, + rtas_work_area_phys(work_area), + rtas_work_area_size(work_area)); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case 0: + ret = 0; + memcpy(buf, rtas_work_area_raw_buf(work_area), sizeof(*buf)); + break; + case -3: /* parameter not implemented */ + ret = -EOPNOTSUPP; + break; + case -9002: /* this partition not authorized to retrieve this parameter */ + ret = -EPERM; + break; + case -9999: /* "parameter error" e.g. 
the buffer is too small */ + ret = -EINVAL; + break; + default: + pr_err("unexpected ibm,get-system-parameter result %d\n", fwrc); + fallthrough; + case -1: /* Hardware/platform error */ + ret = -EIO; + break; + } + + rtas_work_area_free(work_area); + + return ret; +} + +int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf) +{ + const s32 token = rtas_function_token(RTAS_FN_IBM_SET_SYSTEM_PARAMETER); + struct rtas_work_area *work_area; + s32 fwrc; + int ret; + + might_sleep(); + + if (WARN_ON(!buf)) + return -EFAULT; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + work_area = rtas_work_area_alloc(sizeof(*buf)); + + memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); + + do { + fwrc = rtas_call(token, 2, 1, NULL, param.token, + rtas_work_area_phys(work_area)); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case 0: + ret = 0; + break; + case -3: /* parameter not supported */ + ret = -EOPNOTSUPP; + break; + case -9002: /* this partition not authorized to modify this parameter */ + ret = -EPERM; + break; + case -9999: /* "parameter error" e.g. invalid input data */ + ret = -EINVAL; + break; + default: + pr_err("unexpected ibm,set-system-parameter result %d\n", fwrc); + fallthrough; + case -1: /* Hardware/platform error */ + ret = -EIO; + break; + } + + rtas_work_area_free(work_area); + + return ret; +} diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 6e671c3809ec..60e0a58928ef 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -60,7 +60,7 @@ static int pseries_send_map_pe(struct pci_dev *pdev, u16 num_vfs, struct pci_dn *pdn; int rc; unsigned long buid, addr; - int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number"); + int ibm_map_pes = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_MAP_PE_NUMBER); if (ibm_map_pes == RTAS_UNKNOWN_SERVICE) return -EINVAL; diff --git a/arch/powerpc/platforms/pseries/plpks-secvar.c b/arch/powerpc/platforms/pseries/plpks-secvar.c new file mode 100644 index 000000000000..257fd1f8bc19 --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks-secvar.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only + +// Secure variable implementation using the PowerVM LPAR Platform KeyStore (PLPKS) +// +// Copyright 2022, 2023 IBM Corporation +// Authors: Russell Currey +// Andrew Donnellan +// Nayna Jain + +#define pr_fmt(fmt) "secvar: "fmt + +#include <linux/printk.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kobject.h> +#include <linux/nls.h> +#include <asm/machdep.h> +#include <asm/secvar.h> +#include <asm/plpks.h> + +// Config attributes for sysfs +#define PLPKS_CONFIG_ATTR(name, fmt, func) \ + static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ + { \ + return sysfs_emit(buf, fmt, func()); \ + } \ + static struct kobj_attribute attr_##name = __ATTR_RO(name) + +PLPKS_CONFIG_ATTR(version, "%u\n", plpks_get_version); +PLPKS_CONFIG_ATTR(max_object_size, "%u\n", plpks_get_maxobjectsize); +PLPKS_CONFIG_ATTR(total_size, "%u\n", plpks_get_totalsize); +PLPKS_CONFIG_ATTR(used_space, "%u\n", plpks_get_usedspace); +PLPKS_CONFIG_ATTR(supported_policies, "%08x\n", plpks_get_supportedpolicies); +PLPKS_CONFIG_ATTR(signed_update_algorithms, "%016llx\n", plpks_get_signedupdatealgorithms); + +static const struct attribute *config_attrs[] = { + &attr_version.attr, + &attr_max_object_size.attr, + 
&attr_total_size.attr, + &attr_used_space.attr, + &attr_supported_policies.attr, + &attr_signed_update_algorithms.attr, + NULL, +}; + +static u32 get_policy(const char *name) +{ + if ((strcmp(name, "db") == 0) || + (strcmp(name, "dbx") == 0) || + (strcmp(name, "grubdb") == 0) || + (strcmp(name, "grubdbx") == 0) || + (strcmp(name, "sbat") == 0)) + return (PLPKS_WORLDREADABLE | PLPKS_SIGNEDUPDATE); + else + return PLPKS_SIGNEDUPDATE; +} + +static const char * const plpks_var_names[] = { + "PK", + "KEK", + "db", + "dbx", + "grubdb", + "grubdbx", + "sbat", + "moduledb", + "trustedcadb", + NULL, +}; + +static int plpks_get_variable(const char *key, u64 key_len, u8 *data, + u64 *data_size) +{ + struct plpks_var var = {0}; + int rc = 0; + + // We subtract 1 from key_len because we don't need to include the + // null terminator at the end of the string + var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL); + if (!var.name) + return -ENOMEM; + rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name, + key_len - 1); + if (rc < 0) + goto err; + var.namelen = rc * 2; + + var.os = PLPKS_VAR_LINUX; + if (data) { + var.data = data; + var.datalen = *data_size; + } + rc = plpks_read_os_var(&var); + + if (rc) + goto err; + + *data_size = var.datalen; + +err: + kfree(var.name); + if (rc && rc != -ENOENT) { + pr_err("Failed to read variable '%s': %d\n", key, rc); + // Return -EIO since userspace probably doesn't care about the + // specific error + rc = -EIO; + } + return rc; +} + +static int plpks_set_variable(const char *key, u64 key_len, u8 *data, + u64 data_size) +{ + struct plpks_var var = {0}; + int rc = 0; + u64 flags; + + // Secure variables need to be prefixed with 8 bytes of flags. + // We only want to perform the write if we have at least one byte of data. + if (data_size <= sizeof(flags)) + return -EINVAL; + + // We subtract 1 from key_len because we don't need to include the + // null terminator at the end of the string + var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL); + if (!var.name) + return -ENOMEM; + rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name, + key_len - 1); + if (rc < 0) + goto err; + var.namelen = rc * 2; + + // Flags are contained in the first 8 bytes of the buffer, and are always big-endian + flags = be64_to_cpup((__be64 *)data); + + var.datalen = data_size - sizeof(flags); + var.data = data + sizeof(flags); + var.os = PLPKS_VAR_LINUX; + var.policy = get_policy(key); + + // Unlike in the read case, the plpks error code can be useful to + // userspace on write, so we return it rather than just -EIO + rc = plpks_signed_update_var(&var, flags); + +err: + kfree(var.name); + return rc; +} + +// PLPKS dynamic secure boot doesn't give us a format string in the same way OPAL does. +// Instead, report the format using the SB_VERSION variable in the keystore. +// The string is made up by us, and takes the form "ibm,plpks-sb-v<n>" (or "ibm,plpks-sb-unknown" +// if the SB_VERSION variable doesn't exist). Hypervisor defines the SB_VERSION variable as a +// "1 byte unsigned integer value". 
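Before the format handler below, one detail of the write path above: the userspace buffer is self-describing. Its first 8 bytes are big-endian signed-update flags passed straight through to the hypervisor; only the remainder is stored as the variable's data, which is also why plpks_max_size() further down reports maxobjectsize plus sizeof(u64). In outline:

	u64 flags;

	/* write buffer layout: [ 8 BE flag bytes | signed payload ] */
	if (data_size <= sizeof(flags))
		return -EINVAL;		/* flags alone carry no data */

	flags = be64_to_cpup((__be64 *)data);
	var.data = data + sizeof(flags);
	var.datalen = data_size - sizeof(flags);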
+static ssize_t plpks_secvar_format(char *buf, size_t bufsize) +{ + struct plpks_var var = {0}; + ssize_t ret; + u8 version; + + var.component = NULL; + // Only the signed variables have null bytes in their names, this one doesn't + var.name = "SB_VERSION"; + var.namelen = strlen(var.name); + var.datalen = 1; + var.data = &version; + + // Unlike the other vars, SB_VERSION is owned by firmware instead of the OS + ret = plpks_read_fw_var(&var); + if (ret) { + if (ret == -ENOENT) { + ret = snprintf(buf, bufsize, "ibm,plpks-sb-unknown"); + } else { + pr_err("Error %ld reading SB_VERSION from firmware\n", ret); + ret = -EIO; + } + goto err; + } + + ret = snprintf(buf, bufsize, "ibm,plpks-sb-v%hhu", version); +err: + return ret; +} + +static int plpks_max_size(u64 *max_size) +{ + // The max object size reported by the hypervisor is accurate for the + // object itself, but we use the first 8 bytes of data on write as the + // signed update flags, so the max size a user can write is larger. + *max_size = (u64)plpks_get_maxobjectsize() + sizeof(u64); + + return 0; +} + + +static const struct secvar_operations plpks_secvar_ops = { + .get = plpks_get_variable, + .set = plpks_set_variable, + .format = plpks_secvar_format, + .max_size = plpks_max_size, + .config_attrs = config_attrs, + .var_names = plpks_var_names, +}; + +static int plpks_secvar_init(void) +{ + if (!plpks_is_available()) + return -ENODEV; + + return set_secvar_ops(&plpks_secvar_ops); +} +machine_device_initcall(pseries, plpks_secvar_init); diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index 4edd1585e245..6f7bf3fc3aea 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -16,30 +16,28 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/memblock.h> #include <asm/hvcall.h> #include <asm/machdep.h> - -#include "plpks.h" - -#define PKS_FW_OWNER 0x1 -#define PKS_BOOTLOADER_OWNER 0x2 -#define PKS_OS_OWNER 0x3 - -#define LABEL_VERSION 0 -#define MAX_LABEL_ATTR_SIZE 16 -#define MAX_NAME_SIZE 239 -#define MAX_DATA_SIZE 4000 - -#define PKS_FLUSH_MAX_TIMEOUT 5000 //msec -#define PKS_FLUSH_SLEEP 10 //msec -#define PKS_FLUSH_SLEEP_RANGE 400 +#include <asm/plpks.h> +#include <asm/firmware.h> static u8 *ospassword; static u16 ospasswordlength; // Retrieved with H_PKS_GET_CONFIG +static u8 version; +static u16 objoverhead; static u16 maxpwsize; static u16 maxobjsize; +static s16 maxobjlabelsize; +static u32 totalsize; +static u32 usedspace; +static u32 supportedpolicies; +static u32 maxlargeobjectsize; +static u64 signedupdatealgorithms; struct plpks_auth { u8 version; @@ -60,7 +58,7 @@ struct label_attr { struct label { struct label_attr attr; - u8 name[MAX_NAME_SIZE]; + u8 name[PLPKS_MAX_NAME_SIZE]; size_t size; }; @@ -87,6 +85,12 @@ static int pseries_status_to_err(int rc) err = -ENOENT; break; case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + case H_LONG_BUSY_ORDER_10_MSEC: + case H_LONG_BUSY_ORDER_100_MSEC: + case H_LONG_BUSY_ORDER_1_SEC: + case H_LONG_BUSY_ORDER_10_SEC: + case H_LONG_BUSY_ORDER_100_SEC: err = -EBUSY; break; case H_AUTHORITY: @@ -117,16 +121,25 @@ static int pseries_status_to_err(int rc) err = -EINVAL; } + pr_debug("Converted hypervisor code %d to Linux %d\n", rc, err); + return err; } static int plpks_gen_password(void) { unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; - u8 *password, consumer = PKS_OS_OWNER; + u8 *password, consumer = 
PLPKS_OS_OWNER; int rc; - password = kzalloc(maxpwsize, GFP_KERNEL); + // If we booted from kexec, we could be reusing an existing password already + if (ospassword) { + pr_debug("Password of length %u already in use\n", ospasswordlength); + return 0; + } + + // The password must not cross a page boundary, so we align to the next power of 2 + password = kzalloc(roundup_pow_of_two(maxpwsize), GFP_KERNEL); if (!password) return -ENOMEM; @@ -143,7 +156,7 @@ static int plpks_gen_password(void) memcpy(ospassword, password, ospasswordlength); } else { if (rc == H_IN_USE) { - pr_warn("Password is already set for POWER LPAR Platform KeyStore\n"); + pr_warn("Password already set - authenticated operations will fail\n"); rc = 0; } else { goto out; @@ -159,17 +172,19 @@ static struct plpks_auth *construct_auth(u8 consumer) { struct plpks_auth *auth; - if (consumer > PKS_OS_OWNER) + if (consumer > PLPKS_OS_OWNER) return ERR_PTR(-EINVAL); - auth = kzalloc(struct_size(auth, password, maxpwsize), GFP_KERNEL); + // The auth structure must not cross a page boundary and must be + // 16 byte aligned. We align to the next largest power of 2 + auth = kzalloc(roundup_pow_of_two(struct_size(auth, password, maxpwsize)), GFP_KERNEL); if (!auth) return ERR_PTR(-ENOMEM); auth->version = 1; auth->consumer = consumer; - if (consumer == PKS_FW_OWNER || consumer == PKS_BOOTLOADER_OWNER) + if (consumer == PLPKS_FW_OWNER || consumer == PLPKS_BOOTLOADER_OWNER) return auth; memcpy(auth->password, ospassword, ospasswordlength); @@ -187,25 +202,29 @@ static struct label *construct_label(char *component, u8 varos, u8 *name, u16 namelen) { struct label *label; - size_t slen; + size_t slen = 0; - if (!name || namelen > MAX_NAME_SIZE) + if (!name || namelen > PLPKS_MAX_NAME_SIZE) return ERR_PTR(-EINVAL); - slen = strlen(component); - if (component && slen > sizeof(label->attr.prefix)) - return ERR_PTR(-EINVAL); + // Support NULL component for signed updates + if (component) { + slen = strlen(component); + if (slen > sizeof(label->attr.prefix)) + return ERR_PTR(-EINVAL); + } - label = kzalloc(sizeof(*label), GFP_KERNEL); + // The label structure must not cross a page boundary, so we align to the next power of 2 + label = kzalloc(roundup_pow_of_two(sizeof(*label)), GFP_KERNEL); if (!label) return ERR_PTR(-ENOMEM); if (component) memcpy(&label->attr.prefix, component, slen); - label->attr.version = LABEL_VERSION; + label->attr.version = PLPKS_LABEL_VERSION; label->attr.os = varos; - label->attr.length = MAX_LABEL_ATTR_SIZE; + label->attr.length = PLPKS_MAX_LABEL_ATTR_SIZE; memcpy(&label->name, name, namelen); label->size = sizeof(struct label_attr) + namelen; @@ -216,38 +235,164 @@ static struct label *construct_label(char *component, u8 varos, u8 *name, static int _plpks_get_config(void) { unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; - struct { + struct config { u8 version; u8 flags; - __be32 rsvd0; + __be16 rsvd0; + __be16 objoverhead; __be16 maxpwsize; __be16 maxobjlabelsize; __be16 maxobjsize; __be32 totalsize; __be32 usedspace; __be32 supportedpolicies; - __be64 rsvd1; - } __packed config; + __be32 maxlargeobjectsize; + __be64 signedupdatealgorithms; + u8 rsvd1[476]; + } __packed * config; size_t size; - int rc; + int rc = 0; + + size = sizeof(*config); + + // Config struct must not cross a page boundary. 
So long as the struct + // size is a power of 2, this should be fine as alignment is guaranteed + config = kzalloc(size, GFP_KERNEL); + if (!config) { + rc = -ENOMEM; + goto err; + } + + rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(config), size); + + if (rc != H_SUCCESS) { + rc = pseries_status_to_err(rc); + goto err; + } + + version = config->version; + objoverhead = be16_to_cpu(config->objoverhead); + maxpwsize = be16_to_cpu(config->maxpwsize); + maxobjsize = be16_to_cpu(config->maxobjsize); + maxobjlabelsize = be16_to_cpu(config->maxobjlabelsize); + totalsize = be32_to_cpu(config->totalsize); + usedspace = be32_to_cpu(config->usedspace); + supportedpolicies = be32_to_cpu(config->supportedpolicies); + maxlargeobjectsize = be32_to_cpu(config->maxlargeobjectsize); + signedupdatealgorithms = be64_to_cpu(config->signedupdatealgorithms); + + // Validate that the numbers we get back match the requirements of the spec + if (maxpwsize < 32) { + pr_err("Invalid Max Password Size received from hypervisor (%d < 32)\n", maxpwsize); + rc = -EIO; + goto err; + } + + if (maxobjlabelsize < 255) { + pr_err("Invalid Max Object Label Size received from hypervisor (%d < 255)\n", + maxobjlabelsize); + rc = -EIO; + goto err; + } + + if (totalsize < 4096) { + pr_err("Invalid Total Size received from hypervisor (%d < 4096)\n", totalsize); + rc = -EIO; + goto err; + } + + if (version >= 3 && maxlargeobjectsize >= 65536 && maxobjsize != 0xFFFF) { + pr_err("Invalid Max Object Size (0x%x != 0xFFFF)\n", maxobjsize); + rc = -EIO; + goto err; + } + +err: + kfree(config); + return rc; +} + +u8 plpks_get_version(void) +{ + return version; +} + +u16 plpks_get_objoverhead(void) +{ + return objoverhead; +} + +u16 plpks_get_maxpwsize(void) +{ + return maxpwsize; +} + +u16 plpks_get_maxobjectsize(void) +{ + return maxobjsize; +} + +u16 plpks_get_maxobjectlabelsize(void) +{ + return maxobjlabelsize; +} + +u32 plpks_get_totalsize(void) +{ + return totalsize; +} + +u32 plpks_get_usedspace(void) +{ + // Unlike other config values, usedspace regularly changes as objects + // are updated, so we need to refresh. 
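The repeated "must not cross a page boundary" allocations in this file lean on a slab guarantee rather than explicit alignment flags: kmalloc()/kzalloc() of a power-of-two size returns memory naturally aligned to that size (guaranteed since Linux 5.4), so any buffer of size 2^n <= PAGE_SIZE ends in the page it starts in. Illustratively:

	size_t sz = roundup_pow_of_two(maxpwsize);
	u8 *pw = kzalloc(sz, GFP_KERNEL);

	/* for sz = 2^n <= PAGE_SIZE, pw is sz-aligned, so no page crossing */
	WARN_ON(pw && sz <= PAGE_SIZE && offset_in_page(pw) + sz > PAGE_SIZE);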
+ int rc = _plpks_get_config(); + if (rc) { + pr_err("Couldn't get config, rc: %d\n", rc); + return 0; + } + return usedspace; +} + +u32 plpks_get_supportedpolicies(void) +{ + return supportedpolicies; +} + +u32 plpks_get_maxlargeobjectsize(void) +{ + return maxlargeobjectsize; +} + +u64 plpks_get_signedupdatealgorithms(void) +{ + return signedupdatealgorithms; +} - size = sizeof(config); +u16 plpks_get_passwordlen(void) +{ + return ospasswordlength; +} - rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(&config), size); +bool plpks_is_available(void) +{ + int rc; - if (rc != H_SUCCESS) - return pseries_status_to_err(rc); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return false; - maxpwsize = be16_to_cpu(config.maxpwsize); - maxobjsize = be16_to_cpu(config.maxobjsize); + rc = _plpks_get_config(); + if (rc) + return false; - return 0; + return true; } static int plpks_confirm_object_flushed(struct label *label, struct plpks_auth *auth) { unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; + bool timed_out = true; u64 timeout = 0; u8 status; int rc; @@ -259,20 +404,79 @@ static int plpks_confirm_object_flushed(struct label *label, status = retbuf[0]; if (rc) { + timed_out = false; if (rc == H_NOT_FOUND && status == 1) rc = 0; break; } - if (!rc && status == 1) + if (!rc && status == 1) { + timed_out = false; break; + } - usleep_range(PKS_FLUSH_SLEEP, - PKS_FLUSH_SLEEP + PKS_FLUSH_SLEEP_RANGE); - timeout = timeout + PKS_FLUSH_SLEEP; - } while (timeout < PKS_FLUSH_MAX_TIMEOUT); + usleep_range(PLPKS_FLUSH_SLEEP, + PLPKS_FLUSH_SLEEP + PLPKS_FLUSH_SLEEP_RANGE); + timeout = timeout + PLPKS_FLUSH_SLEEP; + } while (timeout < PLPKS_MAX_TIMEOUT); - rc = pseries_status_to_err(rc); + if (timed_out) + return -ETIMEDOUT; + + return pseries_status_to_err(rc); +} + +int plpks_signed_update_var(struct plpks_var *var, u64 flags) +{ + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + int rc; + struct label *label; + struct plpks_auth *auth; + u64 continuetoken = 0; + u64 timeout = 0; + + if (!var->data || var->datalen <= 0 || var->namelen > PLPKS_MAX_NAME_SIZE) + return -EINVAL; + + if (!(var->policy & PLPKS_SIGNEDUPDATE)) + return -EINVAL; + + // Signed updates need the component to be NULL. 
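Stepping back to plpks_confirm_object_flushed() above: the new timed_out flag fixes a quiet bug. Previously, when the poll budget ran out while the last hcall had returned H_SUCCESS with status != 1, the fall-through pseries_status_to_err(rc) mapped the timeout to 0, i.e. success. The rewritten exits keep the outcomes distinct, roughly:

	if (rc == H_NOT_FOUND && status == 1)
		return 0;		/* object already gone/flushed: fine */
	if (rc == H_SUCCESS && status == 1)
		return 0;		/* flush completed within the budget */
	if (timed_out)
		return -ETIMEDOUT;	/* previously misreported as success */
	return pseries_status_to_err(rc);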
+ if (var->component) + return -EINVAL; + + auth = construct_auth(PLPKS_OS_OWNER); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + label = construct_label(var->component, var->os, var->name, var->namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out; + } + + do { + rc = plpar_hcall9(H_PKS_SIGNED_UPDATE, retbuf, + virt_to_phys(auth), virt_to_phys(label), + label->size, var->policy, flags, + virt_to_phys(var->data), var->datalen, + continuetoken); + + continuetoken = retbuf[0]; + if (pseries_status_to_err(rc) == -EBUSY) { + int delay_ms = get_longbusy_msecs(rc); + mdelay(delay_ms); + timeout += delay_ms; + } + rc = pseries_status_to_err(rc); + } while (rc == -EBUSY && timeout < PLPKS_MAX_TIMEOUT); + + if (!rc) + rc = plpks_confirm_object_flushed(label, auth); + + kfree(label); +out: + kfree(auth); return rc; } @@ -285,13 +489,13 @@ int plpks_write_var(struct plpks_var var) int rc; if (!var.component || !var.data || var.datalen <= 0 || - var.namelen > MAX_NAME_SIZE || var.datalen > MAX_DATA_SIZE) + var.namelen > PLPKS_MAX_NAME_SIZE || var.datalen > PLPKS_MAX_DATA_SIZE) return -EINVAL; - if (var.policy & SIGNEDUPDATE) + if (var.policy & PLPKS_SIGNEDUPDATE) return -EINVAL; - auth = construct_auth(PKS_OS_OWNER); + auth = construct_auth(PLPKS_OS_OWNER); if (IS_ERR(auth)) return PTR_ERR(auth); @@ -323,10 +527,10 @@ int plpks_remove_var(char *component, u8 varos, struct plpks_var_name vname) struct label *label; int rc; - if (!component || vname.namelen > MAX_NAME_SIZE) + if (vname.namelen > PLPKS_MAX_NAME_SIZE) return -EINVAL; - auth = construct_auth(PKS_OS_OWNER); + auth = construct_auth(PLPKS_OS_OWNER); if (IS_ERR(auth)) return PTR_ERR(auth); @@ -358,14 +562,14 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) u8 *output; int rc; - if (var->namelen > MAX_NAME_SIZE) + if (var->namelen > PLPKS_MAX_NAME_SIZE) return -EINVAL; auth = construct_auth(consumer); if (IS_ERR(auth)) return PTR_ERR(auth); - if (consumer == PKS_OS_OWNER) { + if (consumer == PLPKS_OS_OWNER) { label = construct_label(var->component, var->os, var->name, var->namelen); if (IS_ERR(label)) { @@ -380,7 +584,7 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) goto out_free_label; } - if (consumer == PKS_OS_OWNER) + if (consumer == PLPKS_OS_OWNER) rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), virt_to_phys(label), label->size, virt_to_phys(output), maxobjsize); @@ -395,17 +599,14 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) goto out_free_output; } - if (var->datalen == 0 || var->datalen > retbuf[0]) + if (!var->data || var->datalen > retbuf[0]) var->datalen = retbuf[0]; - var->data = kzalloc(var->datalen, GFP_KERNEL); - if (!var->data) { - rc = -ENOMEM; - goto out_free_output; - } var->policy = retbuf[1]; - memcpy(var->data, output, var->datalen); + if (var->data) + memcpy(var->data, output, var->datalen); + rc = 0; out_free_output: @@ -420,17 +621,69 @@ out_free_auth: int plpks_read_os_var(struct plpks_var *var) { - return plpks_read_var(PKS_OS_OWNER, var); + return plpks_read_var(PLPKS_OS_OWNER, var); } int plpks_read_fw_var(struct plpks_var *var) { - return plpks_read_var(PKS_FW_OWNER, var); + return plpks_read_var(PLPKS_FW_OWNER, var); } int plpks_read_bootloader_var(struct plpks_var *var) { - return plpks_read_var(PKS_BOOTLOADER_OWNER, var); + return plpks_read_var(PLPKS_BOOTLOADER_OWNER, var); +} + +int plpks_populate_fdt(void *fdt) +{ + int chosen_offset = fdt_path_offset(fdt, "/chosen"); + + if (chosen_offset < 0) { + pr_err("Can't find chosen 
node: %s\n", + fdt_strerror(chosen_offset)); + return chosen_offset; + } + + return fdt_setprop(fdt, chosen_offset, "ibm,plpks-pw", ospassword, ospasswordlength); +} + +// Once a password is registered with the hypervisor it cannot be cleared without +// rebooting the LPAR, so to keep using the PLPKS across kexec boots we need to +// recover the previous password from the FDT. +// +// There are a few challenges here. We don't want the password to be visible to +// users, so we need to clear it from the FDT. This has to be done in early boot. +// Clearing it from the FDT would make the FDT's checksum invalid, so we have to +// manually cause the checksum to be recalculated. +void __init plpks_early_init_devtree(void) +{ + void *fdt = initial_boot_params; + int chosen_node = fdt_path_offset(fdt, "/chosen"); + const u8 *password; + int len; + + if (chosen_node < 0) + return; + + password = fdt_getprop(fdt, chosen_node, "ibm,plpks-pw", &len); + if (len <= 0) { + pr_debug("Couldn't find ibm,plpks-pw node.\n"); + return; + } + + ospassword = memblock_alloc_raw(len, SMP_CACHE_BYTES); + if (!ospassword) { + pr_err("Error allocating memory for password.\n"); + goto out; + } + + memcpy(ospassword, password, len); + ospasswordlength = (u16)len; + +out: + fdt_nop_property(fdt, chosen_node, "ibm,plpks-pw"); + // Since we've cleared the password, we must update the FDT checksum + early_init_dt_verify(fdt); } static __init int pseries_plpks_init(void) diff --git a/arch/powerpc/platforms/pseries/plpks.h b/arch/powerpc/platforms/pseries/plpks.h deleted file mode 100644 index 275ccd86bfb5..000000000000 --- a/arch/powerpc/platforms/pseries/plpks.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2022 IBM Corporation - * Author: Nayna Jain <nayna@linux.ibm.com> - * - * Platform keystore for pseries LPAR(PLPKS). - */ - -#ifndef _PSERIES_PLPKS_H -#define _PSERIES_PLPKS_H - -#include <linux/types.h> -#include <linux/list.h> - -#define OSSECBOOTAUDIT 0x40000000 -#define OSSECBOOTENFORCE 0x20000000 -#define WORLDREADABLE 0x08000000 -#define SIGNEDUPDATE 0x01000000 - -#define PLPKS_VAR_LINUX 0x02 -#define PLPKS_VAR_COMMON 0x04 - -struct plpks_var { - char *component; - u8 *name; - u8 *data; - u32 policy; - u16 namelen; - u16 datalen; - u8 os; -}; - -struct plpks_var_name { - u8 *name; - u16 namelen; -}; - -struct plpks_var_name_list { - u32 varcount; - struct plpks_var_name varlist[]; -}; - -/** - * Writes the specified var and its data to PKS. - * Any caller of PKS driver should present a valid component type for - * their variable. - */ -int plpks_write_var(struct plpks_var var); - -/** - * Removes the specified var and its data from PKS. - */ -int plpks_remove_var(char *component, u8 varos, - struct plpks_var_name vname); - -/** - * Returns the data for the specified os variable. - */ -int plpks_read_os_var(struct plpks_var *var); - -/** - * Returns the data for the specified firmware variable. - */ -int plpks_read_fw_var(struct plpks_var *var); - -/** - * Returns the data for the specified bootloader variable. 
- */ -int plpks_read_bootloader_var(struct plpks_var *var); - -#endif diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f12516c3998c..adafd593d9d3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -155,7 +155,7 @@ static int __init init_ras_IRQ(void) { struct device_node *np; - ras_check_exception_token = rtas_token("check-exception"); + ras_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); /* Internal Errors */ np = of_find_node_by_path("/event-sources/internal-errors"); diff --git a/arch/powerpc/platforms/pseries/rtas-work-area.c b/arch/powerpc/platforms/pseries/rtas-work-area.c new file mode 100644 index 000000000000..b37d52f40360 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-work-area.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "rtas-work-area: " fmt + +#include <linux/genalloc.h> +#include <linux/log2.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/mempool.h> +#include <linux/minmax.h> +#include <linux/mutex.h> +#include <linux/numa.h> +#include <linux/sizes.h> +#include <linux/wait.h> + +#include <asm/machdep.h> +#include <asm/rtas-work-area.h> +#include <asm/rtas.h> + +enum { + /* + * Ensure the pool is page-aligned. + */ + RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE, + /* + * Don't let a single allocation claim the whole arena. + */ + RTAS_WORK_AREA_ARENA_SZ = RTAS_WORK_AREA_MAX_ALLOC_SZ * 2, + /* + * The smallest known work area size is for ibm,get-vpd's + * location code argument, which is limited to 79 characters + * plus 1 nul terminator. + * + * PAPR+ 7.3.20 ibm,get-vpd RTAS Call + * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions + */ + RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80), +}; + +static struct { + struct gen_pool *gen_pool; + char *arena; + struct mutex mutex; /* serializes allocations */ + struct wait_queue_head wqh; + mempool_t descriptor_pool; + bool available; +} rwa_state = { + .mutex = __MUTEX_INITIALIZER(rwa_state.mutex), + .wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh), +}; + +/* + * A single work area buffer and descriptor to serve requests early in + * boot before the allocator is fully initialized. We know 4KB is the + * most any boot time user needs (they all call ibm,get-system-parameter). + */ +static bool early_work_area_in_use __initdata; +static char early_work_area_buf[SZ_4K] __initdata __aligned(SZ_4K); +static struct rtas_work_area early_work_area __initdata = { + .buf = early_work_area_buf, + .size = sizeof(early_work_area_buf), +}; + + +static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size) +{ + WARN_ON(size > early_work_area.size); + WARN_ON(early_work_area_in_use); + early_work_area_in_use = true; + memset(early_work_area.buf, 0, early_work_area.size); + return &early_work_area; +} + +static void __init rtas_work_area_free_early(struct rtas_work_area *work_area) +{ + WARN_ON(work_area != &early_work_area); + WARN_ON(!early_work_area_in_use); + early_work_area_in_use = false; +} + +struct rtas_work_area * __ref __rtas_work_area_alloc(size_t size) +{ + struct rtas_work_area *area; + unsigned long addr; + + might_sleep(); + + /* + * The rtas_work_area_alloc() wrapper enforces this at build + * time. Requests that exceed the arena size will block + * indefinitely. 
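 + * + * Editor's illustrative sketch of a typical caller (not part of the original patch; rtas_work_area_phys() is assumed from <asm/rtas-work-area.h>, and "param" stands in for a system parameter number): + * + *	struct rtas_work_area *work_area = rtas_work_area_alloc(SZ_4K);	/* may sleep */ + *	s32 token = rtas_function_token(RTAS_FN_IBM_GET_SYSTEM_PARAMETER); + *	int rc = rtas_call(token, 3, 1, NULL, param, + *		       rtas_work_area_phys(work_area), SZ_4K); + *	rtas_work_area_free(work_area);	/* wakes any blocked allocators */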
+ */ + WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ); + + if (!rwa_state.available) + return rtas_work_area_alloc_early(size); + /* + * To ensure FCFS behavior and prevent a high rate of smaller + * requests from starving larger ones, use the mutex to queue + * allocations. + */ + mutex_lock(&rwa_state.mutex); + wait_event(rwa_state.wqh, + (addr = gen_pool_alloc(rwa_state.gen_pool, size)) != 0); + mutex_unlock(&rwa_state.mutex); + + area = mempool_alloc(&rwa_state.descriptor_pool, GFP_KERNEL); + area->buf = (char *)addr; + area->size = size; + + return area; +} + +void __ref rtas_work_area_free(struct rtas_work_area *area) +{ + if (!rwa_state.available) { + rtas_work_area_free_early(area); + return; + } + + gen_pool_free(rwa_state.gen_pool, (unsigned long)area->buf, area->size); + mempool_free(area, &rwa_state.descriptor_pool); + wake_up(&rwa_state.wqh); +} + +/* + * Initialization of the work area allocator happens in two parts. To + * reliably reserve an arena that satisfies RTAS addressing + * requirements, we must perform a memblock allocation early, + * immediately after RTAS instantiation. Then we have to wait until + * the slab allocator is up before setting up the descriptor mempool + * and adding the arena to a gen_pool. + */ +static __init int rtas_work_area_allocator_init(void) +{ + const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ); + const phys_addr_t pa_start = __pa(rwa_state.arena); + const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1; + struct gen_pool *pool; + const int nid = NUMA_NO_NODE; + int err; + + err = -ENOMEM; + if (!rwa_state.arena) + goto err_out; + + pool = gen_pool_create(order, nid); + if (!pool) + goto err_out; + /* + * All RTAS functions that consume work areas are OK with + * natural alignment, when they have alignment requirements at + * all. + */ + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + err = gen_pool_add(pool, (unsigned long)rwa_state.arena, + RTAS_WORK_AREA_ARENA_SZ, nid); + if (err) + goto err_destroy; + + err = mempool_init_kmalloc_pool(&rwa_state.descriptor_pool, 1, + sizeof(struct rtas_work_area)); + if (err) + goto err_destroy; + + rwa_state.gen_pool = pool; + rwa_state.available = true; + + pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n", + &pa_start, &pa_end, + RTAS_WORK_AREA_ARENA_SZ / SZ_1K, + RTAS_WORK_AREA_MIN_ALLOC_SZ, + RTAS_WORK_AREA_MAX_ALLOC_SZ); + + return 0; + +err_destroy: + gen_pool_destroy(pool); +err_out: + return err; +} +machine_arch_initcall(pseries, rtas_work_area_allocator_init); + +/** + * rtas_work_area_reserve_arena() - Reserve memory suitable for RTAS work areas. + */ +void __init rtas_work_area_reserve_arena(const phys_addr_t limit) +{ + const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN; + const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ; + const phys_addr_t min = MEMBLOCK_LOW_LIMIT; + const int nid = NUMA_NO_NODE; + + /* + * Too early for a machine_is(pseries) check. But PAPR + * effectively mandates that ibm,get-system-parameter is + * present: + * + * R1–7.3.16–1. All platforms must support the System + * Parameters option. + * + * So set up the arena if we find that, with a fallback to + * ibm,configure-connector, just in case.
+ */ + if (rtas_function_implemented(RTAS_FN_IBM_GET_SYSTEM_PARAMETER) || + rtas_function_implemented(RTAS_FN_IBM_CONFIGURE_CONNECTOR)) + rwa_state.arena = memblock_alloc_try_nid(size, align, min, limit, nid); +} diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8ef3270515a9..4a0cec8cf623 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -57,6 +57,7 @@ #include <asm/pmc.h> #include <asm/xics.h> #include <asm/xive.h> +#include <asm/papr-sysparm.h> #include <asm/ppc-pci.h> #include <asm/i8259.h> #include <asm/udbg.h> @@ -135,11 +136,11 @@ static void __init fwnmi_init(void) #endif int ibm_nmi_register_token; - ibm_nmi_register_token = rtas_token("ibm,nmi-register"); + ibm_nmi_register_token = rtas_function_token(RTAS_FN_IBM_NMI_REGISTER); if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE) return; - ibm_nmi_interlock_token = rtas_token("ibm,nmi-interlock"); + ibm_nmi_interlock_token = rtas_function_token(RTAS_FN_IBM_NMI_INTERLOCK); if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE)) return; @@ -941,28 +942,21 @@ void pSeries_coalesce_init(void) */ static void __init pSeries_cmo_feature_init(void) { + static struct papr_sysparm_buf buf __initdata; + static_assert(sizeof(buf.val) >= CMO_MAXLENGTH); char *ptr, *key, *value, *end; - int call_status; int page_order = IOMMU_PAGE_SHIFT_4K; pr_debug(" -> fw_cmo_feature_init()\n"); - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - CMO_CHARACTERISTICS_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - - if (call_status != 0) { - spin_unlock(&rtas_data_buf_lock); + + if (papr_sysparm_get(PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS, &buf)) { pr_debug("CMO not available\n"); pr_debug(" <- fw_cmo_feature_init()\n"); return; } - end = rtas_data_buf + CMO_MAXLENGTH - 2; - ptr = rtas_data_buf + 2; /* step over strlen value */ + end = &buf.val[CMO_MAXLENGTH]; + ptr = &buf.val[0]; key = value = ptr; while (*ptr && (ptr <= end)) { @@ -1008,7 +1002,6 @@ static void __init pSeries_cmo_feature_init(void) } else pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); - spin_unlock(&rtas_data_buf_lock); pr_debug(" <- fw_cmo_feature_init()\n"); } @@ -1078,14 +1071,14 @@ static void __init pseries_init(void) static void pseries_power_off(void) { int rc; - int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups"); + int rtas_poweroff_ups_token = rtas_function_token(RTAS_FN_IBM_POWER_OFF_UPS); if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_POWER_OFF); if (rtas_poweron_auto == 0 || rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) { - rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1); + rc = rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1); printk(KERN_INFO "RTAS power-off returned %d\n", rc); } else { rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL); diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index fd2174edfa1d..c597711ef20a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -55,7 +55,7 @@ static cpumask_var_t of_spin_mask; int smp_query_cpu_stopped(unsigned int pcpu) { int cpu_status, status; - int qcss_tok = rtas_token("query-cpu-stopped-state"); + int qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); if (qcss_tok == RTAS_UNKNOWN_SERVICE) { printk_once(KERN_INFO @@ -108,7 +108,7 @@ 
static inline int smp_startup_cpu(unsigned int lcpu) * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. */ - start_cpu = rtas_token("start-cpu"); + start_cpu = rtas_function_token(RTAS_FN_START_CPU); if (start_cpu == RTAS_UNKNOWN_SERVICE) return 1; @@ -266,7 +266,7 @@ void __init smp_init_pseries(void) * We know prom_init will not have started them if RTAS supports * query-cpu-stopped-state. */ - if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) { + if (rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE) == RTAS_UNKNOWN_SERVICE) { if (cpu_has_feature(CPU_FTR_SMT)) { for_each_present_cpu(i) { if (cpu_thread_in_core(i) == 0) @@ -278,11 +278,5 @@ void __init smp_init_pseries(void) cpumask_clear_cpu(boot_cpuid, of_spin_mask); } - /* Non-lpar has additional take/give timebase */ - if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { - smp_ops->give_timebase = rtas_give_timebase; - smp_ops->take_timebase = rtas_take_timebase; - } - pr_debug(" <- smp_init_pSeries()\n"); } diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile index a81d155b89ae..6f5e2727963c 100644 --- a/arch/powerpc/purgatory/Makefile +++ b/arch/powerpc/purgatory/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n +KCSAN_SANITIZE := n targets += trampoline_$(BITS).o purgatory.ro diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index f8320f8e5bc7..b772a833d9b7 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -200,10 +200,10 @@ static struct ics ics_rtas = { __init int ics_rtas_init(void) { - ibm_get_xive = rtas_token("ibm,get-xive"); - ibm_set_xive = rtas_token("ibm,set-xive"); - ibm_int_on = rtas_token("ibm,int-on"); - ibm_int_off = rtas_token("ibm,int-off"); + ibm_get_xive = rtas_function_token(RTAS_FN_IBM_GET_XIVE); + ibm_set_xive = rtas_function_token(RTAS_FN_IBM_SET_XIVE); + ibm_int_on = rtas_function_token(RTAS_FN_IBM_INT_ON); + ibm_int_off = rtas_function_token(RTAS_FN_IBM_INT_OFF); /* We enable the RTAS "ICS" if RTAS is present with the * appropriate tokens diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index eb25d7554ffd..d334de392e6c 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -5,6 +5,7 @@ GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n # Disable ftrace for the entire directory ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 0da66bc4823d..73c620c2a3a1 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -76,9 +76,6 @@ static cpumask_t xmon_batch_cpus = CPU_MASK_NONE; #define xmon_owner 0 #endif /* CONFIG_SMP */ -#ifdef CONFIG_PPC_PSERIES -static int set_indicator_token = RTAS_UNKNOWN_SERVICE; -#endif static unsigned long in_xmon __read_mostly = 0; static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); static bool xmon_is_ro = IS_ENABLED(CONFIG_XMON_DEFAULT_RO_MODE); @@ -398,6 +395,7 @@ static inline void disable_surveillance(void) #ifdef CONFIG_PPC_PSERIES /* Since this can't be a module, args should end up below 4GB. */ static struct rtas_args args; + const s32 token = rtas_function_token(RTAS_FN_SET_INDICATOR); /* * At this point we have got all the cpus we can into @@ -406,10 +404,10 @@ static inline void disable_surveillance(void) * If we did try to take rtas.lock there would be a * real possibility of deadlock. 
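 * (Editor's note: this is why the call below goes through rtas_call_unlocked() with a statically allocated struct rtas_args, rather than rtas_call(), which would take rtas.lock.)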
*/ - if (set_indicator_token == RTAS_UNKNOWN_SERVICE) + if (token == RTAS_UNKNOWN_SERVICE) return; - rtas_call_unlocked(&args, set_indicator_token, 3, 1, NULL, + rtas_call_unlocked(&args, token, 3, 1, NULL, SURVEILLANCE_TOKEN, 0, 0); #endif /* CONFIG_PPC_PSERIES */ @@ -3976,14 +3974,6 @@ static void xmon_init(int enable) __debugger_iabr_match = xmon_iabr_match; __debugger_break_match = xmon_break_match; __debugger_fault_handler = xmon_fault_handler; - -#ifdef CONFIG_PPC_PSERIES - /* - * Get the token here to avoid trying to get a lock - * during the crash, causing a deadlock. - */ - set_indicator_token = rtas_token("set-indicator"); -#endif } else { __debugger = NULL; __debugger_ipi = NULL; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 9c687da7756d..c5e42cc37604 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -14,10 +14,11 @@ config RISCV def_bool y select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_CURRENT_STACK_POINTER - select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -44,12 +45,14 @@ config RISCV select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_GENERAL_HUGETLB + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_HUGE_PMD_SHARE if 64BIT + select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU - select CLONE_BACKWARDS select CLINT_TIMER if !MMU + select CLONE_BACKWARDS select COMMON_CLK select CPU_PM if CPU_IDLE select EDAC_SUPPORT @@ -84,16 +87,16 @@ config RISCV select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU - select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE - select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS select HAVE_CONTEXT_TRACKING_USER select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_EBPF_JIT if MMU + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO if MMU && 64BIT @@ -110,10 +113,9 @@ config RISCV select HAVE_PERF_USER_STACK_DUMP select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS - select HAVE_RSEQ select IRQ_DOMAIN select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA if MODULES @@ -137,7 +139,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER if !XIP_KERNEL + select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT @@ -234,9 +236,9 @@ config LOCKDEP_SUPPORT config RISCV_DMA_NONCOHERENT bool select ARCH_HAS_DMA_PREP_COHERENT - select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SETUP_DMA_OPS + select 
ARCH_HAS_SYNC_DMA_FOR_CPU + select ARCH_HAS_SYNC_DMA_FOR_DEVICE select DMA_DIRECT_REMAP config AS_HAS_INSN @@ -351,11 +353,11 @@ endchoice config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on SMP && MMU + select ARCH_SUPPORTS_NUMA_BALANCING select GENERIC_ARCH_NUMA + select NEED_PER_CPU_EMBED_FIRST_CHUNK select OF_NUMA - select ARCH_SUPPORTS_NUMA_BALANCING select USE_PERCPU_NUMA_NODE_ID - select NEED_PER_CPU_EMBED_FIRST_CHUNK help Enable NUMA (Non-Uniform Memory Access) support. @@ -400,8 +402,8 @@ config RISCV_ISA_SVPBMT bool "SVPBMT extension support" depends on 64BIT && MMU depends on !XIP_KERNEL - select RISCV_ALTERNATIVE default y + select RISCV_ALTERNATIVE help Adds support to dynamically detect the presence of the SVPBMT ISA-extension (Supervisor-mode: page-based memory types) and @@ -415,20 +417,36 @@ config RISCV_ISA_SVPBMT If you don't know what to do here, say Y. -config TOOLCHAIN_HAS_ZICBOM +config TOOLCHAIN_HAS_ZBB bool default y - depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zicbom) - depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zicbom) - depends on LLD_VERSION >= 150000 || LD_VERSION >= 23800 + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900 + depends on AS_IS_GNU + +config RISCV_ISA_ZBB + bool "Zbb extension support for bit manipulation instructions" + depends on TOOLCHAIN_HAS_ZBB + depends on !XIP_KERNEL && MMU + select RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the ZBB + extension (basic bit manipulation) and enable its usage. + + The Zbb extension provides instructions to accelerate a number + of bit-specific operations (count bit population, sign extending, + bit rotation, etc). + + If you don't know what to do here, say Y. config RISCV_ISA_ZICBOM bool "Zicbom extension support for non-coherent DMA operation" - depends on TOOLCHAIN_HAS_ZICBOM depends on !XIP_KERNEL && MMU - select RISCV_DMA_NONCOHERENT - select RISCV_ALTERNATIVE default y + select RISCV_ALTERNATIVE + select RISCV_DMA_NONCOHERENT help Adds support to dynamically detect the presence of the ZICBOM extension (Cache Block Management Operations) and enable its @@ -490,9 +508,9 @@ config RISCV_BOOT_SPINWAIT config KEXEC bool "Kexec system call" - select KEXEC_CORE - select HOTPLUG_CPU if SMP depends on MMU + select HOTPLUG_CPU if SMP + select KEXEC_CORE help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -503,10 +521,10 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" + depends on 64BIT && MMU + select HAVE_IMA_KEXEC if IMA select KEXEC_CORE select KEXEC_ELF - select HAVE_IMA_KEXEC if IMA - depends on 64BIT && MMU help This is a new version of the kexec system call.
This system call is file based and takes file descriptors as system call argument @@ -595,15 +613,15 @@ config EFI_STUB config EFI bool "UEFI runtime support" depends on OF && !XIP_KERNEL - select LIBFDT - select UCS2_STRING - select EFI_PARAMS_FROM_FDT - select EFI_STUB + depends on MMU + default y select EFI_GENERIC_STUB + select EFI_PARAMS_FROM_FDT select EFI_RUNTIME_WRAPPERS + select EFI_STUB + select LIBFDT select RISCV_ISA_C - depends on MMU - default y + select UCS2_STRING help This option provides support for runtime services provided by UEFI firmware (such as non-volatile variables, realtime @@ -682,8 +700,8 @@ config PORTABLE bool default !NONPORTABLE select EFI - select OF select MMU + select OF menu "Power management options" diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 4b91367604ea..1cf69f958f10 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -43,7 +43,7 @@ config ARCH_SUNXI config ARCH_VIRT def_bool SOC_VIRT - + config SOC_VIRT bool "QEMU Virt Machine" select CLINT_TIMER if RISCV_M_MODE @@ -88,7 +88,8 @@ config SOC_CANAAN_K210_DTB_BUILTIN If unsure, say Y. config ARCH_CANAAN_K210_DTB_SOURCE - def_bool SOC_CANAAN_K210_DTB_SOURCE + string + default SOC_CANAAN_K210_DTB_SOURCE config SOC_CANAAN_K210_DTB_SOURCE string "Source file for the Canaan Kendryte K210 builtin DTB" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 7123511d977c..6203c3378922 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -11,7 +11,11 @@ LDFLAGS_vmlinux := ifeq ($(CONFIG_DYNAMIC_FTRACE),y) LDFLAGS_vmlinux := --no-relax KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY - CC_FLAGS_FTRACE := -fpatchable-function-entry=8 +ifeq ($(CONFIG_RISCV_ISA_C),y) + CC_FLAGS_FTRACE := -fpatchable-function-entry=4 +else + CC_FLAGS_FTRACE := -fpatchable-function-entry=2 +endif endif ifeq ($(CONFIG_CMODEL_MEDLOW),y) @@ -58,9 +62,6 @@ riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zicsr_zifencei) riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei -# Check if the toolchain supports Zicbom extension -riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZICBOM) := $(riscv-march-y)_zicbom - # Check if the toolchain supports Zihintpause extension riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE) := $(riscv-march-y)_zihintpause diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index 1031038423e7..da55cb247e89 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -4,6 +4,7 @@ */ #include <linux/kernel.h> +#include <linux/memory.h> #include <linux/module.h> #include <linux/string.h> #include <linux/bug.h> @@ -107,7 +108,10 @@ void __init_or_module sifive_errata_patch_func(struct alt_entry *begin, tmp = (1U << alt->errata_id); if (cpu_req_errata & tmp) { - patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + mutex_lock(&text_mutex); + patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), + alt->alt_len); + mutex_unlock(&text_mutex); cpu_apply_errata |= tmp; } } diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index fac5742d1c1e..3b96a06d3c54 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -5,6 +5,7 @@ #include <linux/bug.h> #include <linux/kernel.h> +#include <linux/memory.h> #include <linux/module.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -87,6 +88,7 @@ void __init_or_module thead_errata_patch_func(struct
alt_entry *begin, struct al struct alt_entry *alt; u32 cpu_req_errata = thead_errata_probe(stage, archid, impid); u32 tmp; + void *oldptr, *altptr; for (alt = begin; alt < end; alt++) { if (alt->vendor_id != THEAD_VENDOR_ID) @@ -96,12 +98,17 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al tmp = (1U << alt->errata_id); if (cpu_req_errata & tmp) { + oldptr = ALT_OLD_PTR(alt); + altptr = ALT_ALT_PTR(alt); + /* On vm-alternatives, the mmu isn't running yet */ - if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) - memcpy((void *)__pa_symbol(alt->old_ptr), - (void *)__pa_symbol(alt->alt_ptr), alt->alt_len); - else - patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) { + memcpy(oldptr, altptr, alt->alt_len); + } else { + mutex_lock(&text_mutex); + patch_text_nosync(oldptr, altptr, alt->alt_len); + mutex_unlock(&text_mutex); + } } } diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 2c0f4c887289..51c6867e02f3 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -7,11 +7,11 @@ #ifdef __ASSEMBLY__ .macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len - RISCV_PTR \oldptr - RISCV_PTR \newptr - REG_ASM \vendor_id - REG_ASM \new_len - .word \errata_id + .4byte \oldptr - . + .4byte \newptr - . + .2byte \vendor_id + .2byte \new_len + .4byte \errata_id .endm .macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg @@ -59,11 +59,11 @@ #include <linux/stringify.h> #define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \ - RISCV_PTR " " oldptr "\n" \ - RISCV_PTR " " newptr "\n" \ - REG_ASM " " vendor_id "\n" \ - REG_ASM " " newlen "\n" \ - ".word " errata_id "\n" + ".4byte ((" oldptr ") - .) \n" \ + ".4byte ((" newptr ") - .) 
\n" \ + ".2byte " vendor_id "\n" \ + ".2byte " newlen "\n" \ + ".4byte " errata_id "\n" #define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \ ".if " __stringify(enable) " == 1\n" \ diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 6511dd73e812..b8648d4f2ac1 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -23,17 +23,25 @@ #define RISCV_ALTERNATIVES_MODULE 1 /* alternatives applied during module-init */ #define RISCV_ALTERNATIVES_EARLY_BOOT 2 /* alternatives applied before mmu start */ +/* add the relative offset to the address of the offset to get the absolute address */ +#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f) +#define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset) +#define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset) + void __init apply_boot_alternatives(void); void __init apply_early_boot_alternatives(void); void apply_module_alternatives(void *start, size_t length); +void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, + int patch_offset); + struct alt_entry { - void *old_ptr; /* address of original instruciton or data */ - void *alt_ptr; /* address of replacement instruction or data */ - unsigned long vendor_id; /* cpu vendor id */ - unsigned long alt_len; /* The replacement size */ - unsigned int errata_id; /* The errata id */ -} __packed; + s32 old_offset; /* offset relative to original instruction or data */ + s32 alt_offset; /* offset relative to replacement instruction or data */ + u16 vendor_id; /* cpu vendor id */ + u16 alt_len; /* The replacement size */ + u32 errata_id; /* The errata id */ +}; struct errata_checkfunc_id { unsigned long vendor_id; diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index e7acffdf21d2..30e7d2455960 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -14,6 +14,7 @@ #include <asm/auxvec.h> #include <asm/byteorder.h> #include <asm/cacheinfo.h> +#include <asm/hwcap.h> /* * These are used to set parameters in the core dumps. @@ -59,12 +60,13 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); #define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) #endif #endif + /* - * This yields a mask that user programs can use to figure out what - * instruction set this CPU supports. This could be done in user space, - * but it's not easy, and we've already done it here. + * Provides information on the availiable set of ISA extensions to userspace, + * via a bitmap that coorespends to each single-letter ISA extension. This is + * essentially defunct, but will remain for compatibility with userspace. 
*/ -#define ELF_HWCAP (elf_hwcap) +#define ELF_HWCAP (elf_hwcap & ((1UL << RISCV_ISA_EXT_BASE) - 1)) extern unsigned long elf_hwcap; /* diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 4180312d2a70..fb1a810f3d8c 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -7,6 +7,8 @@ #include <asm/alternative.h> #include <asm/csr.h> +#include <asm/insn-def.h> +#include <asm/hwcap.h> #include <asm/vendorid_list.h> #ifdef CONFIG_ERRATA_SIFIVE @@ -22,10 +24,6 @@ #define ERRATA_THEAD_NUMBER 3 #endif -#define CPUFEATURE_SVPBMT 0 -#define CPUFEATURE_ZICBOM 1 -#define CPUFEATURE_NUMBER 2 - #ifdef __ASSEMBLY__ #define ALT_INSN_FAULT(x) \ @@ -55,7 +53,7 @@ asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID, \ #define ALT_SVPBMT(_val, prot) \ asm(ALTERNATIVE_2("li %0, 0\t\nnop", \ "li %0, %1\t\nslli %0,%0,%3", 0, \ - CPUFEATURE_SVPBMT, CONFIG_RISCV_ISA_SVPBMT, \ + RISCV_ISA_EXT_SVPBMT, CONFIG_RISCV_ISA_SVPBMT, \ "li %0, %2\t\nslli %0,%0,%4", THEAD_VENDOR_ID, \ ERRATA_THEAD_PBMT, CONFIG_ERRATA_THEAD_PBMT) \ : "=r"(_val) \ @@ -125,11 +123,11 @@ asm volatile(ALTERNATIVE_2( \ "mv a0, %1\n\t" \ "j 2f\n\t" \ "3:\n\t" \ - "cbo." __stringify(_op) " (a0)\n\t" \ + CBO_##_op(a0) \ "add a0, a0, %0\n\t" \ "2:\n\t" \ "bltu a0, %2, 3b\n\t" \ - "nop", 0, CPUFEATURE_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \ + "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \ "mv a0, %1\n\t" \ "j 2f\n\t" \ "3:\n\t" \ diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 04dad3380041..9e73922e1e2e 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -42,6 +42,14 @@ struct dyn_arch_ftrace { * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to * return address (original pc + 4) * + *<ftrace enable>: + * 0: auipc t0/ra, 0x? + * 4: jalr t0/ra, ?(t0/ra) + * + *<ftrace disable>: + * 0: nop + * 4: nop + * * Dynamic ftrace generates probes to call sites, so we must deal with * both auipc and jalr at the same time. */ @@ -52,25 +60,43 @@ struct dyn_arch_ftrace { #define AUIPC_OFFSET_MASK (0xfffff000) #define AUIPC_PAD (0x00001000) #define JALR_SHIFT 20 -#define JALR_BASIC (0x000080e7) -#define AUIPC_BASIC (0x00000097) +#define JALR_RA (0x000080e7) +#define AUIPC_RA (0x00000097) +#define JALR_T0 (0x000282e7) +#define AUIPC_T0 (0x00000297) #define NOP4 (0x00000013) -#define make_call(caller, callee, call) \ +#define to_jalr_t0(offset) \ + (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_T0) + +#define to_auipc_t0(offset) \ + ((offset & JALR_SIGN_MASK) ? \ + (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_T0) : \ + ((offset & AUIPC_OFFSET_MASK) | AUIPC_T0)) + +#define make_call_t0(caller, callee, call) \ do { \ - call[0] = to_auipc_insn((unsigned int)((unsigned long)callee - \ - (unsigned long)caller)); \ - call[1] = to_jalr_insn((unsigned int)((unsigned long)callee - \ - (unsigned long)caller)); \ + unsigned int offset = \ + (unsigned long) callee - (unsigned long) caller; \ + call[0] = to_auipc_t0(offset); \ + call[1] = to_jalr_t0(offset); \ } while (0) -#define to_jalr_insn(offset) \ - (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_BASIC) +#define to_jalr_ra(offset) \ + (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_RA) -#define to_auipc_insn(offset) \ +#define to_auipc_ra(offset) \ ((offset & JALR_SIGN_MASK) ? 
\ - (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_BASIC) : \ - ((offset & AUIPC_OFFSET_MASK) | AUIPC_BASIC)) + (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_RA) : \ + ((offset & AUIPC_OFFSET_MASK) | AUIPC_RA)) + +#define make_call_ra(caller, callee, call) \ +do { \ + unsigned int offset = \ + (unsigned long) callee - (unsigned long) caller; \ + call[0] = to_auipc_ra(offset); \ + call[1] = to_jalr_ra(offset); \ +} while (0) /* * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here. diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 64ad1937e714..e3021b2590de 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -8,24 +8,11 @@ #ifndef _ASM_RISCV_HWCAP_H #define _ASM_RISCV_HWCAP_H +#include <asm/alternative-macros.h> #include <asm/errno.h> #include <linux/bits.h> #include <uapi/asm/hwcap.h> -#ifndef __ASSEMBLY__ -#include <linux/jump_label.h> -/* - * This yields a mask that user programs can use to figure out what - * instruction set this cpu supports. - */ -#define ELF_HWCAP (elf_hwcap) - -enum { - CAP_HWCAP = 1, -}; - -extern unsigned long elf_hwcap; - #define RISCV_ISA_EXT_a ('a' - 'a') #define RISCV_ISA_EXT_c ('c' - 'a') #define RISCV_ISA_EXT_d ('d' - 'a') @@ -37,42 +24,31 @@ extern unsigned long elf_hwcap; #define RISCV_ISA_EXT_u ('u' - 'a') /* - * Increse this to higher value as kernel support more ISA extensions. + * These macros represent the logical IDs of each multi-letter RISC-V ISA + * extension and are used in the ISA bitmap. The logical IDs start from + * RISCV_ISA_EXT_BASE, which allows the 0-25 range to be reserved for single + * letter extensions. The maximum, RISCV_ISA_EXT_MAX, is defined in order + * to allocate the bitmap and may be increased when necessary. + * + * New extensions should just be added to the bottom, rather than added + * alphabetically, in order to avoid unnecessary shuffling. */ -#define RISCV_ISA_EXT_MAX 64 -#define RISCV_ISA_EXT_NAME_LEN_MAX 32 +#define RISCV_ISA_EXT_BASE 26 -/* The base ID for multi-letter ISA extensions */ -#define RISCV_ISA_EXT_BASE 26 +#define RISCV_ISA_EXT_SSCOFPMF 26 +#define RISCV_ISA_EXT_SSTC 27 +#define RISCV_ISA_EXT_SVINVAL 28 +#define RISCV_ISA_EXT_SVPBMT 29 +#define RISCV_ISA_EXT_ZBB 30 +#define RISCV_ISA_EXT_ZICBOM 31 +#define RISCV_ISA_EXT_ZIHINTPAUSE 32 -/* - * This enum represent the logical ID for each multi-letter RISC-V ISA extension. - * The logical ID should start from RISCV_ISA_EXT_BASE and must not exceed - * RISCV_ISA_EXT_MAX. 0-25 range is reserved for single letter - * extensions while all the multi-letter extensions should define the next - * available logical extension id. - */ -enum riscv_isa_ext_id { - RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE, - RISCV_ISA_EXT_SVPBMT, - RISCV_ISA_EXT_ZICBOM, - RISCV_ISA_EXT_ZIHINTPAUSE, - RISCV_ISA_EXT_SSTC, - RISCV_ISA_EXT_SVINVAL, - RISCV_ISA_EXT_ID_MAX -}; -static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX); +#define RISCV_ISA_EXT_MAX 64 +#define RISCV_ISA_EXT_NAME_LEN_MAX 32 -/* - * This enum represents the logical ID for each RISC-V ISA extension static - * keys. We can use static key to optimize code path if some ISA extensions - * are available. 
- */ -enum riscv_isa_ext_key { - RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */ - RISCV_ISA_EXT_KEY_SVINVAL, - RISCV_ISA_EXT_KEY_MAX, -}; +#ifndef __ASSEMBLY__ + +#include <linux/jump_label.h> struct riscv_isa_ext_data { /* Name of the extension displayed to userspace via /proc/cpuinfo */ @@ -81,20 +57,40 @@ struct riscv_isa_ext_data { unsigned int isa_ext_id; }; -extern struct static_key_false riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_MAX]; +static __always_inline bool +riscv_has_extension_likely(const unsigned long ext) +{ + compiletime_assert(ext < RISCV_ISA_EXT_MAX, + "ext must be < RISCV_ISA_EXT_MAX"); + + asm_volatile_goto( + ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) + : + : [ext] "i" (ext) + : + : l_no); + + return true; +l_no: + return false; +} -static __always_inline int riscv_isa_ext2key(int num) +static __always_inline bool +riscv_has_extension_unlikely(const unsigned long ext) { - switch (num) { - case RISCV_ISA_EXT_f: - return RISCV_ISA_EXT_KEY_FPU; - case RISCV_ISA_EXT_d: - return RISCV_ISA_EXT_KEY_FPU; - case RISCV_ISA_EXT_SVINVAL: - return RISCV_ISA_EXT_KEY_SVINVAL; - default: - return -EINVAL; - } + compiletime_assert(ext < RISCV_ISA_EXT_MAX, + "ext must be < RISCV_ISA_EXT_MAX"); + + asm_volatile_goto( + ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) + : + : [ext] "i" (ext) + : + : l_yes); + + return false; +l_yes: + return true; } unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index 16044affa57c..e01ab51f50d2 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -12,6 +12,12 @@ #define INSN_R_RD_SHIFT 7 #define INSN_R_OPCODE_SHIFT 0 +#define INSN_I_SIMM12_SHIFT 20 +#define INSN_I_RS1_SHIFT 15 +#define INSN_I_FUNC3_SHIFT 12 +#define INSN_I_RD_SHIFT 7 +#define INSN_I_OPCODE_SHIFT 0 + #ifdef __ASSEMBLY__ #ifdef CONFIG_AS_HAS_INSN @@ -20,6 +26,10 @@ .insn r \opcode, \func3, \func7, \rd, \rs1, \rs2 .endm + .macro insn_i, opcode, func3, rd, rs1, simm12 + .insn i \opcode, \func3, \rd, \rs1, \simm12 + .endm + #else #include <asm/gpr-num.h> @@ -33,9 +43,18 @@ (.L__gpr_num_\rs2 << INSN_R_RS2_SHIFT)) .endm + .macro insn_i, opcode, func3, rd, rs1, simm12 + .4byte ((\opcode << INSN_I_OPCODE_SHIFT) | \ + (\func3 << INSN_I_FUNC3_SHIFT) | \ + (.L__gpr_num_\rd << INSN_I_RD_SHIFT) | \ + (.L__gpr_num_\rs1 << INSN_I_RS1_SHIFT) | \ + (\simm12 << INSN_I_SIMM12_SHIFT)) + .endm + #endif #define __INSN_R(...) insn_r __VA_ARGS__ +#define __INSN_I(...) insn_i __VA_ARGS__ #else /* ! 
__ASSEMBLY__ */ @@ -44,6 +63,9 @@ #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ ".insn r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" +#define __INSN_I(opcode, func3, rd, rs1, simm12) \ + ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" + #else #include <linux/stringify.h> @@ -60,14 +82,32 @@ " (.L__gpr_num_\\rs2 << " __stringify(INSN_R_RS2_SHIFT) "))\n" \ " .endm\n" +#define DEFINE_INSN_I \ + __DEFINE_ASM_GPR_NUMS \ +" .macro insn_i, opcode, func3, rd, rs1, simm12\n" \ +" .4byte ((\\opcode << " __stringify(INSN_I_OPCODE_SHIFT) ") |" \ +" (\\func3 << " __stringify(INSN_I_FUNC3_SHIFT) ") |" \ +" (.L__gpr_num_\\rd << " __stringify(INSN_I_RD_SHIFT) ") |" \ +" (.L__gpr_num_\\rs1 << " __stringify(INSN_I_RS1_SHIFT) ") |" \ +" (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \ +" .endm\n" + #define UNDEFINE_INSN_R \ " .purgem insn_r\n" +#define UNDEFINE_INSN_I \ +" .purgem insn_i\n" + #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ DEFINE_INSN_R \ "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \ UNDEFINE_INSN_R +#define __INSN_I(opcode, func3, rd, rs1, simm12) \ + DEFINE_INSN_I \ + "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \ + UNDEFINE_INSN_I + #endif #endif /* ! __ASSEMBLY__ */ @@ -76,9 +116,14 @@ __INSN_R(RV_##opcode, RV_##func3, RV_##func7, \ RV_##rd, RV_##rs1, RV_##rs2) +#define INSN_I(opcode, func3, rd, rs1, simm12) \ + __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \ + RV_##rs1, RV_##simm12) + #define RV_OPCODE(v) __ASM_STR(v) #define RV_FUNC3(v) __ASM_STR(v) #define RV_FUNC7(v) __ASM_STR(v) +#define RV_SIMM12(v) __ASM_STR(v) #define RV_RD(v) __ASM_STR(v) #define RV_RS1(v) __ASM_STR(v) #define RV_RS2(v) __ASM_STR(v) @@ -87,6 +132,7 @@ #define RV___RS1(v) __RV_REG(v) #define RV___RS2(v) __RV_REG(v) +#define RV_OPCODE_MISC_MEM RV_OPCODE(15) #define RV_OPCODE_SYSTEM RV_OPCODE(115) #define HFENCE_VVMA(vaddr, asid) \ @@ -134,4 +180,16 @@ INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(51), \ __RD(0), RS1(gaddr), RS2(vmid)) +#define CBO_inval(base) \ + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ + RS1(base), SIMM12(0)) + +#define CBO_clean(base) \ + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ + RS1(base), SIMM12(1)) + +#define CBO_flush(base) \ + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ + RS1(base), SIMM12(2)) + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h new file mode 100644 index 000000000000..8d5c84f2d5ef --- /dev/null +++ b/arch/riscv/include/asm/insn.h @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 SiFive + */ + +#ifndef _ASM_RISCV_INSN_H +#define _ASM_RISCV_INSN_H + +#include <linux/bits.h> + +#define RV_INSN_FUNCT3_MASK GENMASK(14, 12) +#define RV_INSN_FUNCT3_OPOFF 12 +#define RV_INSN_OPCODE_MASK GENMASK(6, 0) +#define RV_INSN_OPCODE_OPOFF 0 +#define RV_INSN_FUNCT12_OPOFF 20 + +#define RV_ENCODE_FUNCT3(f_) (RVG_FUNCT3_##f_ << RV_INSN_FUNCT3_OPOFF) +#define RV_ENCODE_FUNCT12(f_) (RVG_FUNCT12_##f_ << RV_INSN_FUNCT12_OPOFF) + +/* The bit field of immediate value in I-type instruction */ +#define RV_I_IMM_SIGN_OPOFF 31 +#define RV_I_IMM_11_0_OPOFF 20 +#define RV_I_IMM_SIGN_OFF 12 +#define RV_I_IMM_11_0_OFF 0 +#define RV_I_IMM_11_0_MASK GENMASK(11, 0) + +/* The bit field of immediate value in J-type instruction */ +#define RV_J_IMM_SIGN_OPOFF 31 +#define RV_J_IMM_10_1_OPOFF 21 +#define RV_J_IMM_11_OPOFF 20 +#define RV_J_IMM_19_12_OPOFF 12 +#define RV_J_IMM_SIGN_OFF 20 +#define RV_J_IMM_10_1_OFF 1 +#define 
RV_J_IMM_11_OFF 11 +#define RV_J_IMM_19_12_OFF 12 +#define RV_J_IMM_10_1_MASK GENMASK(9, 0) +#define RV_J_IMM_11_MASK GENMASK(0, 0) +#define RV_J_IMM_19_12_MASK GENMASK(7, 0) + +/* + * U-type IMMs contain the upper 20bits [31:12] of an immediate with + * the rest filled in by zeros, so no shifting required. Similarly, + * bit31 contains the signed state, so no sign extension necessary. + */ +#define RV_U_IMM_SIGN_OPOFF 31 +#define RV_U_IMM_31_12_OPOFF 0 +#define RV_U_IMM_31_12_MASK GENMASK(31, 12) + +/* The bit field of immediate value in B-type instruction */ +#define RV_B_IMM_SIGN_OPOFF 31 +#define RV_B_IMM_10_5_OPOFF 25 +#define RV_B_IMM_4_1_OPOFF 8 +#define RV_B_IMM_11_OPOFF 7 +#define RV_B_IMM_SIGN_OFF 12 +#define RV_B_IMM_10_5_OFF 5 +#define RV_B_IMM_4_1_OFF 1 +#define RV_B_IMM_11_OFF 11 +#define RV_B_IMM_10_5_MASK GENMASK(5, 0) +#define RV_B_IMM_4_1_MASK GENMASK(3, 0) +#define RV_B_IMM_11_MASK GENMASK(0, 0) + +/* The register offset in RVG instruction */ +#define RVG_RS1_OPOFF 15 +#define RVG_RS2_OPOFF 20 +#define RVG_RD_OPOFF 7 +#define RVG_RD_MASK GENMASK(4, 0) + +/* The bit field of immediate value in RVC J instruction */ +#define RVC_J_IMM_SIGN_OPOFF 12 +#define RVC_J_IMM_4_OPOFF 11 +#define RVC_J_IMM_9_8_OPOFF 9 +#define RVC_J_IMM_10_OPOFF 8 +#define RVC_J_IMM_6_OPOFF 7 +#define RVC_J_IMM_7_OPOFF 6 +#define RVC_J_IMM_3_1_OPOFF 3 +#define RVC_J_IMM_5_OPOFF 2 +#define RVC_J_IMM_SIGN_OFF 11 +#define RVC_J_IMM_4_OFF 4 +#define RVC_J_IMM_9_8_OFF 8 +#define RVC_J_IMM_10_OFF 10 +#define RVC_J_IMM_6_OFF 6 +#define RVC_J_IMM_7_OFF 7 +#define RVC_J_IMM_3_1_OFF 1 +#define RVC_J_IMM_5_OFF 5 +#define RVC_J_IMM_4_MASK GENMASK(0, 0) +#define RVC_J_IMM_9_8_MASK GENMASK(1, 0) +#define RVC_J_IMM_10_MASK GENMASK(0, 0) +#define RVC_J_IMM_6_MASK GENMASK(0, 0) +#define RVC_J_IMM_7_MASK GENMASK(0, 0) +#define RVC_J_IMM_3_1_MASK GENMASK(2, 0) +#define RVC_J_IMM_5_MASK GENMASK(0, 0) + +/* The bit field of immediate value in RVC B instruction */ +#define RVC_B_IMM_SIGN_OPOFF 12 +#define RVC_B_IMM_4_3_OPOFF 10 +#define RVC_B_IMM_7_6_OPOFF 5 +#define RVC_B_IMM_2_1_OPOFF 3 +#define RVC_B_IMM_5_OPOFF 2 +#define RVC_B_IMM_SIGN_OFF 8 +#define RVC_B_IMM_4_3_OFF 3 +#define RVC_B_IMM_7_6_OFF 6 +#define RVC_B_IMM_2_1_OFF 1 +#define RVC_B_IMM_5_OFF 5 +#define RVC_B_IMM_4_3_MASK GENMASK(1, 0) +#define RVC_B_IMM_7_6_MASK GENMASK(1, 0) +#define RVC_B_IMM_2_1_MASK GENMASK(1, 0) +#define RVC_B_IMM_5_MASK GENMASK(0, 0) + +#define RVC_INSN_FUNCT4_MASK GENMASK(15, 12) +#define RVC_INSN_FUNCT4_OPOFF 12 +#define RVC_INSN_FUNCT3_MASK GENMASK(15, 13) +#define RVC_INSN_FUNCT3_OPOFF 13 +#define RVC_INSN_J_RS2_MASK GENMASK(6, 2) +#define RVC_INSN_OPCODE_MASK GENMASK(1, 0) +#define RVC_ENCODE_FUNCT3(f_) (RVC_FUNCT3_##f_ << RVC_INSN_FUNCT3_OPOFF) +#define RVC_ENCODE_FUNCT4(f_) (RVC_FUNCT4_##f_ << RVC_INSN_FUNCT4_OPOFF) + +/* The register offset in RVC op=C0 instruction */ +#define RVC_C0_RS1_OPOFF 7 +#define RVC_C0_RS2_OPOFF 2 +#define RVC_C0_RD_OPOFF 2 + +/* The register offset in RVC op=C1 instruction */ +#define RVC_C1_RS1_OPOFF 7 +#define RVC_C1_RS2_OPOFF 2 +#define RVC_C1_RD_OPOFF 7 + +/* The register offset in RVC op=C2 instruction */ +#define RVC_C2_RS1_OPOFF 7 +#define RVC_C2_RS2_OPOFF 2 +#define RVC_C2_RD_OPOFF 7 + +/* parts of opcode for RVG*/ +#define RVG_OPCODE_FENCE 0x0f +#define RVG_OPCODE_AUIPC 0x17 +#define RVG_OPCODE_BRANCH 0x63 +#define RVG_OPCODE_JALR 0x67 +#define RVG_OPCODE_JAL 0x6f +#define RVG_OPCODE_SYSTEM 0x73 + +/* parts of opcode for RVC*/ +#define RVC_OPCODE_C0 0x0 +#define RVC_OPCODE_C1 0x1 +#define
RVC_OPCODE_C2 0x2 + +/* parts of funct3 code for I, M, A extension*/ +#define RVG_FUNCT3_JALR 0x0 +#define RVG_FUNCT3_BEQ 0x0 +#define RVG_FUNCT3_BNE 0x1 +#define RVG_FUNCT3_BLT 0x4 +#define RVG_FUNCT3_BGE 0x5 +#define RVG_FUNCT3_BLTU 0x6 +#define RVG_FUNCT3_BGEU 0x7 + +/* parts of funct3 code for C extension*/ +#define RVC_FUNCT3_C_BEQZ 0x6 +#define RVC_FUNCT3_C_BNEZ 0x7 +#define RVC_FUNCT3_C_J 0x5 +#define RVC_FUNCT3_C_JAL 0x1 +#define RVC_FUNCT4_C_JR 0x8 +#define RVC_FUNCT4_C_JALR 0x9 +#define RVC_FUNCT4_C_EBREAK 0x9 + +#define RVG_FUNCT12_EBREAK 0x1 +#define RVG_FUNCT12_SRET 0x102 + +#define RVG_MATCH_AUIPC (RVG_OPCODE_AUIPC) +#define RVG_MATCH_JALR (RV_ENCODE_FUNCT3(JALR) | RVG_OPCODE_JALR) +#define RVG_MATCH_JAL (RVG_OPCODE_JAL) +#define RVG_MATCH_FENCE (RVG_OPCODE_FENCE) +#define RVG_MATCH_BEQ (RV_ENCODE_FUNCT3(BEQ) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_BNE (RV_ENCODE_FUNCT3(BNE) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_BLT (RV_ENCODE_FUNCT3(BLT) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_BGE (RV_ENCODE_FUNCT3(BGE) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_BLTU (RV_ENCODE_FUNCT3(BLTU) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_BGEU (RV_ENCODE_FUNCT3(BGEU) | RVG_OPCODE_BRANCH) +#define RVG_MATCH_EBREAK (RV_ENCODE_FUNCT12(EBREAK) | RVG_OPCODE_SYSTEM) +#define RVG_MATCH_SRET (RV_ENCODE_FUNCT12(SRET) | RVG_OPCODE_SYSTEM) +#define RVC_MATCH_C_BEQZ (RVC_ENCODE_FUNCT3(C_BEQZ) | RVC_OPCODE_C1) +#define RVC_MATCH_C_BNEZ (RVC_ENCODE_FUNCT3(C_BNEZ) | RVC_OPCODE_C1) +#define RVC_MATCH_C_J (RVC_ENCODE_FUNCT3(C_J) | RVC_OPCODE_C1) +#define RVC_MATCH_C_JAL (RVC_ENCODE_FUNCT3(C_JAL) | RVC_OPCODE_C1) +#define RVC_MATCH_C_JR (RVC_ENCODE_FUNCT4(C_JR) | RVC_OPCODE_C2) +#define RVC_MATCH_C_JALR (RVC_ENCODE_FUNCT4(C_JALR) | RVC_OPCODE_C2) +#define RVC_MATCH_C_EBREAK (RVC_ENCODE_FUNCT4(C_EBREAK) | RVC_OPCODE_C2) + +#define RVG_MASK_AUIPC (RV_INSN_OPCODE_MASK) +#define RVG_MASK_JALR (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_JAL (RV_INSN_OPCODE_MASK) +#define RVG_MASK_FENCE (RV_INSN_OPCODE_MASK) +#define RVC_MASK_C_JALR (RVC_INSN_FUNCT4_MASK | RVC_INSN_J_RS2_MASK | RVC_INSN_OPCODE_MASK) +#define RVC_MASK_C_JR (RVC_INSN_FUNCT4_MASK | RVC_INSN_J_RS2_MASK | RVC_INSN_OPCODE_MASK) +#define RVC_MASK_C_JAL (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK) +#define RVC_MASK_C_J (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK) +#define RVG_MASK_BEQ (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_BNE (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_BLT (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_BGE (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_BLTU (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVG_MASK_BGEU (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK) +#define RVC_MASK_C_BEQZ (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK) +#define RVC_MASK_C_BNEZ (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK) +#define RVC_MASK_C_EBREAK 0xffff +#define RVG_MASK_EBREAK 0xffffffff +#define RVG_MASK_SRET 0xffffffff + +#define __INSN_LENGTH_MASK _UL(0x3) +#define __INSN_LENGTH_GE_32 _UL(0x3) +#define __INSN_OPCODE_MASK _UL(0x7F) +#define __INSN_BRANCH_OPCODE _UL(RVG_OPCODE_BRANCH) + +#define __RISCV_INSN_FUNCS(name, mask, val) \ +static __always_inline bool riscv_insn_is_##name(u32 code) \ +{ \ + BUILD_BUG_ON(~(mask) & (val)); \ + return (code & (mask)) == (val); \ +} \ + +#if __riscv_xlen == 32 +/* C.JAL is an RV32C-only instruction */ +__RISCV_INSN_FUNCS(c_jal, RVC_MASK_C_JAL, RVC_MATCH_C_JAL) +#else +#define riscv_insn_is_c_jal(opcode) 0 +#endif 
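 + +/* + * Editor's note (not part of the original patch): each __RISCV_INSN_FUNCS() + * invocation below expands to a predicate of the form + * + *	static __always_inline bool riscv_insn_is_jal(u32 code) + *	{ + *		return (code & RVG_MASK_JAL) == RVG_MATCH_JAL; + *	} + * + * plus a BUILD_BUG_ON() consistency check, so decode paths can classify an + * instruction word without open-coding opcode/funct masks. + */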
+__RISCV_INSN_FUNCS(auipc, RVG_MASK_AUIPC, RVG_MATCH_AUIPC) +__RISCV_INSN_FUNCS(jalr, RVG_MASK_JALR, RVG_MATCH_JALR) +__RISCV_INSN_FUNCS(jal, RVG_MASK_JAL, RVG_MATCH_JAL) +__RISCV_INSN_FUNCS(c_jr, RVC_MASK_C_JR, RVC_MATCH_C_JR) +__RISCV_INSN_FUNCS(c_jalr, RVC_MASK_C_JALR, RVC_MATCH_C_JALR) +__RISCV_INSN_FUNCS(c_j, RVC_MASK_C_J, RVC_MATCH_C_J) +__RISCV_INSN_FUNCS(beq, RVG_MASK_BEQ, RVG_MATCH_BEQ) +__RISCV_INSN_FUNCS(bne, RVG_MASK_BNE, RVG_MATCH_BNE) +__RISCV_INSN_FUNCS(blt, RVG_MASK_BLT, RVG_MATCH_BLT) +__RISCV_INSN_FUNCS(bge, RVG_MASK_BGE, RVG_MATCH_BGE) +__RISCV_INSN_FUNCS(bltu, RVG_MASK_BLTU, RVG_MATCH_BLTU) +__RISCV_INSN_FUNCS(bgeu, RVG_MASK_BGEU, RVG_MATCH_BGEU) +__RISCV_INSN_FUNCS(c_beqz, RVC_MASK_C_BEQZ, RVC_MATCH_C_BEQZ) +__RISCV_INSN_FUNCS(c_bnez, RVC_MASK_C_BNEZ, RVC_MATCH_C_BNEZ) +__RISCV_INSN_FUNCS(c_ebreak, RVC_MASK_C_EBREAK, RVC_MATCH_C_EBREAK) +__RISCV_INSN_FUNCS(ebreak, RVG_MASK_EBREAK, RVG_MATCH_EBREAK) +__RISCV_INSN_FUNCS(sret, RVG_MASK_SRET, RVG_MATCH_SRET) +__RISCV_INSN_FUNCS(fence, RVG_MASK_FENCE, RVG_MATCH_FENCE); + +/* special case to catch _any_ system instruction */ +static __always_inline bool riscv_insn_is_system(u32 code) +{ + return (code & RV_INSN_OPCODE_MASK) == RVG_OPCODE_SYSTEM; +} + +/* special case to catch _any_ branch instruction */ +static __always_inline bool riscv_insn_is_branch(u32 code) +{ + return (code & RV_INSN_OPCODE_MASK) == RVG_OPCODE_BRANCH; +} + +#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) +#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) +#define RV_X(X, s, mask) (((X) >> (s)) & (mask)) +#define RVC_X(X, s, mask) RV_X(X, s, mask) + +#define RV_EXTRACT_RD_REG(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) + +#define RV_EXTRACT_UTYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); }) + +#define RV_EXTRACT_JTYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \ + (RV_X(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \ + (RV_X(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \ + (RV_IMM_SIGN(x_) << RV_J_IMM_SIGN_OFF); }) + +#define RV_EXTRACT_ITYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \ + (RV_IMM_SIGN(x_) << RV_I_IMM_SIGN_OFF); }) + +#define RV_EXTRACT_BTYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \ + (RV_X(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \ + (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ + (RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); }) + +#define RVC_EXTRACT_JTYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RVC_X(x_, RVC_J_IMM_3_1_OPOFF, RVC_J_IMM_3_1_MASK) << RVC_J_IMM_3_1_OFF) | \ + (RVC_X(x_, RVC_J_IMM_4_OPOFF, RVC_J_IMM_4_MASK) << RVC_J_IMM_4_OFF) | \ + (RVC_X(x_, RVC_J_IMM_5_OPOFF, RVC_J_IMM_5_MASK) << RVC_J_IMM_5_OFF) | \ + (RVC_X(x_, RVC_J_IMM_6_OPOFF, RVC_J_IMM_6_MASK) << RVC_J_IMM_6_OFF) | \ + (RVC_X(x_, RVC_J_IMM_7_OPOFF, RVC_J_IMM_7_MASK) << RVC_J_IMM_7_OFF) | \ + (RVC_X(x_, RVC_J_IMM_9_8_OPOFF, RVC_J_IMM_9_8_MASK) << RVC_J_IMM_9_8_OFF) | \ + (RVC_X(x_, RVC_J_IMM_10_OPOFF, RVC_J_IMM_10_MASK) << RVC_J_IMM_10_OFF) | \ + (RVC_IMM_SIGN(x_) << RVC_J_IMM_SIGN_OFF); }) + +#define RVC_EXTRACT_BTYPE_IMM(x) \ + ({typeof(x) x_ = (x); \ + (RVC_X(x_, RVC_B_IMM_2_1_OPOFF, RVC_B_IMM_2_1_MASK) << RVC_B_IMM_2_1_OFF) | \ + (RVC_X(x_, RVC_B_IMM_4_3_OPOFF, RVC_B_IMM_4_3_MASK) << RVC_B_IMM_4_3_OFF) | \ + (RVC_X(x_, 
RVC_B_IMM_5_OPOFF, RVC_B_IMM_5_MASK) << RVC_B_IMM_5_OFF) | \ + (RVC_X(x_, RVC_B_IMM_7_6_OPOFF, RVC_B_IMM_7_6_MASK) << RVC_B_IMM_7_6_OFF) | \ + (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); }) + +/* + * Get the immediate from a J-type instruction. + * + * @insn: instruction to process + * Return: immediate + */ +static inline s32 riscv_insn_extract_jtype_imm(u32 insn) +{ + return RV_EXTRACT_JTYPE_IMM(insn); +} + +/* + * Update a J-type instruction with an immediate value. + * + * @insn: pointer to the jtype instruction + * @imm: the immediate to insert into the instruction + */ +static inline void riscv_insn_insert_jtype_imm(u32 *insn, s32 imm) +{ + /* drop the old IMMs, all jal IMM bits sit at 31:12 */ + *insn &= ~GENMASK(31, 12); + *insn |= (RV_X(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) | + (RV_X(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) | + (RV_X(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) | + (RV_X(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF); +} + +/* + * Put together one immediate from a U-type and I-type instruction pair. + * + * The U-type contains an upper immediate, meaning bits[31:12] with [11:0] + * being zero, while the I-type contains a 12bit immediate. + * Combined these can encode larger 32bit values and are used for example + * in auipc + jalr pairs to allow larger jumps. + * + * @utype_insn: instruction containing the upper immediate + * @itype_insn: instruction containing the lower 12-bit immediate + * Return: combined immediate + */ +static inline s32 riscv_insn_extract_utype_itype_imm(u32 utype_insn, u32 itype_insn) +{ + s32 imm; + + imm = RV_EXTRACT_UTYPE_IMM(utype_insn); + imm += RV_EXTRACT_ITYPE_IMM(itype_insn); + + return imm; +} + +/* + * Update a set of two instructions (U-type + I-type) with an immediate value. + * + * Used for example in auipc+jalr pairs: the U-type instruction contains + * a 20bit upper immediate representing bits[31:12], while the I-type + * instruction contains a 12bit immediate representing bits[11:0]. + * + * This also takes into account that both separate immediates are + * considered as signed values, so if the I-type immediate becomes + * negative (BIT(11) set) the U-type part gets adjusted.
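 + * + * Editor's worked example (not from the original patch): for imm = -4 + * (0xfffffffc) the low twelve bits are 0xffc with BIT(11) set, so the U-type + * part is computed as (imm & 0xfffff000) + 0x1000, which truncates to 0x0 in + * 32 bits; auipc then contributes pc + 0, and the jalr's sign-extended -4 + * yields the intended target pc - 4.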
+ +/* + * Update a set of two instructions (U-type + I-type) with an immediate value. + * + * Used, for example, in auipc+jalr pairs: the U-type instruction contains + * a 20bit upper immediate representing bits[31:12], while the I-type + * instruction contains a 12bit immediate representing bits[11:0]. + * + * This also takes into account that both separate immediates are + * considered as signed values, so if the I-type immediate becomes + * negative (BIT(11) set) the U-type part gets adjusted. + * + * @utype_insn: pointer to the utype instruction of the pair + * @itype_insn: pointer to the itype instruction of the pair + * @imm: the immediate to insert into the two instructions + */ +static inline void riscv_insn_insert_utype_itype_imm(u32 *utype_insn, u32 *itype_insn, s32 imm) +{ + /* drop possible old IMM values */ + *utype_insn &= ~(RV_U_IMM_31_12_MASK); + *itype_insn &= ~(RV_I_IMM_11_0_MASK << RV_I_IMM_11_0_OPOFF); + + /* add the adapted IMMs */ + *utype_insn |= (imm & RV_U_IMM_31_12_MASK) + ((imm & BIT(11)) << 1); + *itype_insn |= ((imm & RV_I_IMM_11_0_MASK) << RV_I_IMM_11_0_OPOFF); +} +#endif /* _ASM_RISCV_INSN_H */ diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 6d58bbb5da46..14a5ea8d8ef0 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -18,6 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto( + " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" " .option norvc \n\t" @@ -39,6 +40,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke const bool branch) { asm_volatile_goto( + " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" " .option norvc \n\t" diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 93f43a3e7886..cc7da66ee0c0 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -18,6 +18,7 @@ #include <asm/kvm_vcpu_insn.h> #include <asm/kvm_vcpu_sbi.h> #include <asm/kvm_vcpu_timer.h> +#include <asm/kvm_vcpu_pmu.h> #define KVM_MAX_VCPUS 1024 @@ -228,9 +229,11 @@ struct kvm_vcpu_arch { /* Don't run the VCPU (blocked) */ bool pause; + + /* Performance monitoring context */ + struct kvm_pmu pmu_context; }; -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} @@ -297,11 +300,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); void kvm_riscv_gstage_free_pgd(struct kvm *kvm); void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); -void kvm_riscv_gstage_mode_detect(void); -unsigned long kvm_riscv_gstage_mode(void); +void __init kvm_riscv_gstage_mode_detect(void); +unsigned long __init kvm_riscv_gstage_mode(void); int kvm_riscv_gstage_gpa_bits(void); -void kvm_riscv_gstage_vmid_detect(void); +void __init kvm_riscv_gstage_vmid_detect(void); unsigned long kvm_riscv_gstage_vmid_bits(void); int kvm_riscv_gstage_vmid_init(struct kvm *kvm); bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h new file mode 100644 index 000000000000..395518a1664e --- /dev/null +++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#ifndef __KVM_VCPU_RISCV_PMU_H +#define __KVM_VCPU_RISCV_PMU_H + +#include <linux/perf/riscv_pmu.h> +#include <asm/sbi.h> + +#ifdef CONFIG_RISCV_PMU_SBI +#define RISCV_KVM_MAX_FW_CTRS 32 +#define RISCV_KVM_MAX_HW_CTRS 32 +#define RISCV_KVM_MAX_COUNTERS (RISCV_KVM_MAX_HW_CTRS + RISCV_KVM_MAX_FW_CTRS) +static_assert(RISCV_KVM_MAX_COUNTERS <= 64); + +struct kvm_fw_event { + /* Current value of the event */ + unsigned long value; + + /* Event monitoring status */
+ bool started; +}; + +/* Per virtual pmu counter data */ +struct kvm_pmc { + u8 idx; + struct perf_event *perf_event; + u64 counter_val; + union sbi_pmu_ctr_info cinfo; + /* Event monitoring status */ + bool started; + /* Monitoring event ID */ + unsigned long event_idx; +}; + +/* PMU data structure per vcpu */ +struct kvm_pmu { + struct kvm_pmc pmc[RISCV_KVM_MAX_COUNTERS]; + struct kvm_fw_event fw_event[RISCV_KVM_MAX_FW_CTRS]; + /* Number of the virtual firmware counters available */ + int num_fw_ctrs; + /* Number of the virtual hardware counters available */ + int num_hw_ctrs; + /* A flag to indicate that pmu initialization is done */ + bool init_done; + /* Bitmap of all the virtual counters in use */ + DECLARE_BITMAP(pmc_in_use, RISCV_KVM_MAX_COUNTERS); +}; + +#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu_context) +#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu_context)) + +#if defined(CONFIG_32BIT) +#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ +{.base = CSR_CYCLEH, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, \ +{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +#else +#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ +{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +#endif + +int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid); +int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask); + +int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, u64 ival, + struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + unsigned long eidx, u64 evtdata, + struct kvm_vcpu_sbi_return *retdata); +int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata); +void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu); + +#else +struct kvm_pmu { +}; + +#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ +{.base = 0, .count = 0, .func = NULL }, + +static inline void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) {} +static inline int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) +{ + return 0; +} + +static inline void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) {} +static inline void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) {} +#endif /* CONFIG_RISCV_PMU_SBI */ +#endif /* !__KVM_VCPU_RISCV_PMU_H */ diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index f79478a85d2d..8425556af7d1 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -18,6 +18,13 @@ struct kvm_vcpu_sbi_context { int return_handled; }; +struct kvm_vcpu_sbi_return { + unsigned long out_val; + unsigned long err_val; + struct kvm_cpu_trap *utrap; + bool uexit; +}; + struct kvm_vcpu_sbi_extension { unsigned long extid_start; unsigned long
extid_end; @@ -27,8 +34,10 @@ struct kvm_vcpu_sbi_extension { * specific error codes. */ int (*handler)(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, struct kvm_cpu_trap *utrap, - bool *exit); + struct kvm_vcpu_sbi_return *retdata); + + /* Extension specific probe function */ + unsigned long (*probe)(struct kvm_vcpu *vcpu); }; void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run); diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h index 76aa96a9fc08..0f3baaa6a9a8 100644 --- a/arch/riscv/include/asm/module.h +++ b/arch/riscv/include/asm/module.h @@ -5,6 +5,7 @@ #define _ASM_RISCV_MODULE_H #include <asm-generic/module.h> +#include <linux/elf.h> struct module; unsigned long module_emit_got_entry(struct module *mod, unsigned long val); @@ -111,4 +112,19 @@ static inline struct plt_entry *get_plt_entry(unsigned long val, #endif /* CONFIG_MODULE_SECTIONS */ +static inline const Elf_Shdr *find_section(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + const char *name) +{ + const Elf_Shdr *s, *se; + const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { + if (strcmp(name, secstrs + s->sh_name) == 0) + return s; + } + + return NULL; +} + #endif /* _ASM_RISCV_MODULE_H */ diff --git a/arch/riscv/include/asm/parse_asm.h b/arch/riscv/include/asm/parse_asm.h deleted file mode 100644 index f36368de839f..000000000000 --- a/arch/riscv/include/asm/parse_asm.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2020 SiFive - */ - -#include <linux/bits.h> - -/* The bit field of immediate value in I-type instruction */ -#define I_IMM_SIGN_OPOFF 31 -#define I_IMM_11_0_OPOFF 20 -#define I_IMM_SIGN_OFF 12 -#define I_IMM_11_0_OFF 0 -#define I_IMM_11_0_MASK GENMASK(11, 0) - -/* The bit field of immediate value in J-type instruction */ -#define J_IMM_SIGN_OPOFF 31 -#define J_IMM_10_1_OPOFF 21 -#define J_IMM_11_OPOFF 20 -#define J_IMM_19_12_OPOFF 12 -#define J_IMM_SIGN_OFF 20 -#define J_IMM_10_1_OFF 1 -#define J_IMM_11_OFF 11 -#define J_IMM_19_12_OFF 12 -#define J_IMM_10_1_MASK GENMASK(9, 0) -#define J_IMM_11_MASK GENMASK(0, 0) -#define J_IMM_19_12_MASK GENMASK(7, 0) - -/* The bit field of immediate value in B-type instruction */ -#define B_IMM_SIGN_OPOFF 31 -#define B_IMM_10_5_OPOFF 25 -#define B_IMM_4_1_OPOFF 8 -#define B_IMM_11_OPOFF 7 -#define B_IMM_SIGN_OFF 12 -#define B_IMM_10_5_OFF 5 -#define B_IMM_4_1_OFF 1 -#define B_IMM_11_OFF 11 -#define B_IMM_10_5_MASK GENMASK(5, 0) -#define B_IMM_4_1_MASK GENMASK(3, 0) -#define B_IMM_11_MASK GENMASK(0, 0) - -/* The register offset in RVG instruction */ -#define RVG_RS1_OPOFF 15 -#define RVG_RS2_OPOFF 20 -#define RVG_RD_OPOFF 7 - -/* The bit field of immediate value in RVC J instruction */ -#define RVC_J_IMM_SIGN_OPOFF 12 -#define RVC_J_IMM_4_OPOFF 11 -#define RVC_J_IMM_9_8_OPOFF 9 -#define RVC_J_IMM_10_OPOFF 8 -#define RVC_J_IMM_6_OPOFF 7 -#define RVC_J_IMM_7_OPOFF 6 -#define RVC_J_IMM_3_1_OPOFF 3 -#define RVC_J_IMM_5_OPOFF 2 -#define RVC_J_IMM_SIGN_OFF 11 -#define RVC_J_IMM_4_OFF 4 -#define RVC_J_IMM_9_8_OFF 8 -#define RVC_J_IMM_10_OFF 10 -#define RVC_J_IMM_6_OFF 6 -#define RVC_J_IMM_7_OFF 7 -#define RVC_J_IMM_3_1_OFF 1 -#define RVC_J_IMM_5_OFF 5 -#define RVC_J_IMM_4_MASK GENMASK(0, 0) -#define RVC_J_IMM_9_8_MASK GENMASK(1, 0) -#define RVC_J_IMM_10_MASK GENMASK(0, 0) -#define RVC_J_IMM_6_MASK GENMASK(0, 0) -#define RVC_J_IMM_7_MASK GENMASK(0, 0) -#define 
RVC_J_IMM_3_1_MASK GENMASK(2, 0) -#define RVC_J_IMM_5_MASK GENMASK(0, 0) - -/* The bit field of immediate value in RVC B instruction */ -#define RVC_B_IMM_SIGN_OPOFF 12 -#define RVC_B_IMM_4_3_OPOFF 10 -#define RVC_B_IMM_7_6_OPOFF 5 -#define RVC_B_IMM_2_1_OPOFF 3 -#define RVC_B_IMM_5_OPOFF 2 -#define RVC_B_IMM_SIGN_OFF 8 -#define RVC_B_IMM_4_3_OFF 3 -#define RVC_B_IMM_7_6_OFF 6 -#define RVC_B_IMM_2_1_OFF 1 -#define RVC_B_IMM_5_OFF 5 -#define RVC_B_IMM_4_3_MASK GENMASK(1, 0) -#define RVC_B_IMM_7_6_MASK GENMASK(1, 0) -#define RVC_B_IMM_2_1_MASK GENMASK(1, 0) -#define RVC_B_IMM_5_MASK GENMASK(0, 0) - -/* The register offset in RVC op=C0 instruction */ -#define RVC_C0_RS1_OPOFF 7 -#define RVC_C0_RS2_OPOFF 2 -#define RVC_C0_RD_OPOFF 2 - -/* The register offset in RVC op=C1 instruction */ -#define RVC_C1_RS1_OPOFF 7 -#define RVC_C1_RS2_OPOFF 2 -#define RVC_C1_RD_OPOFF 7 - -/* The register offset in RVC op=C2 instruction */ -#define RVC_C2_RS1_OPOFF 7 -#define RVC_C2_RS2_OPOFF 2 -#define RVC_C2_RD_OPOFF 7 - -/* parts of opcode for RVG*/ -#define OPCODE_BRANCH 0x63 -#define OPCODE_JALR 0x67 -#define OPCODE_JAL 0x6f -#define OPCODE_SYSTEM 0x73 - -/* parts of opcode for RVC*/ -#define OPCODE_C_0 0x0 -#define OPCODE_C_1 0x1 -#define OPCODE_C_2 0x2 - -/* parts of funct3 code for I, M, A extension*/ -#define FUNCT3_JALR 0x0 -#define FUNCT3_BEQ 0x0 -#define FUNCT3_BNE 0x1000 -#define FUNCT3_BLT 0x4000 -#define FUNCT3_BGE 0x5000 -#define FUNCT3_BLTU 0x6000 -#define FUNCT3_BGEU 0x7000 - -/* parts of funct3 code for C extension*/ -#define FUNCT3_C_BEQZ 0xc000 -#define FUNCT3_C_BNEZ 0xe000 -#define FUNCT3_C_J 0xa000 -#define FUNCT3_C_JAL 0x2000 -#define FUNCT4_C_JR 0x8000 -#define FUNCT4_C_JALR 0xf000 - -#define FUNCT12_SRET 0x10200000 - -#define MATCH_JALR (FUNCT3_JALR | OPCODE_JALR) -#define MATCH_JAL (OPCODE_JAL) -#define MATCH_BEQ (FUNCT3_BEQ | OPCODE_BRANCH) -#define MATCH_BNE (FUNCT3_BNE | OPCODE_BRANCH) -#define MATCH_BLT (FUNCT3_BLT | OPCODE_BRANCH) -#define MATCH_BGE (FUNCT3_BGE | OPCODE_BRANCH) -#define MATCH_BLTU (FUNCT3_BLTU | OPCODE_BRANCH) -#define MATCH_BGEU (FUNCT3_BGEU | OPCODE_BRANCH) -#define MATCH_SRET (FUNCT12_SRET | OPCODE_SYSTEM) -#define MATCH_C_BEQZ (FUNCT3_C_BEQZ | OPCODE_C_1) -#define MATCH_C_BNEZ (FUNCT3_C_BNEZ | OPCODE_C_1) -#define MATCH_C_J (FUNCT3_C_J | OPCODE_C_1) -#define MATCH_C_JAL (FUNCT3_C_JAL | OPCODE_C_1) -#define MATCH_C_JR (FUNCT4_C_JR | OPCODE_C_2) -#define MATCH_C_JALR (FUNCT4_C_JALR | OPCODE_C_2) - -#define MASK_JALR 0x707f -#define MASK_JAL 0x7f -#define MASK_C_JALR 0xf07f -#define MASK_C_JR 0xf07f -#define MASK_C_JAL 0xe003 -#define MASK_C_J 0xe003 -#define MASK_BEQ 0x707f -#define MASK_BNE 0x707f -#define MASK_BLT 0x707f -#define MASK_BGE 0x707f -#define MASK_BLTU 0x707f -#define MASK_BGEU 0x707f -#define MASK_C_BEQZ 0xe003 -#define MASK_C_BNEZ 0xe003 -#define MASK_SRET 0xffffffff - -#define __INSN_LENGTH_MASK _UL(0x3) -#define __INSN_LENGTH_GE_32 _UL(0x3) -#define __INSN_OPCODE_MASK _UL(0x7F) -#define __INSN_BRANCH_OPCODE _UL(OPCODE_BRANCH) - -/* Define a series of is_XXX_insn functions to check if the value INSN - * is an instance of instruction XXX. 
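For one concrete instance, the DECLARE_INSN(jal, MATCH_JAL, MASK_JAL) invocation generated by the macro defined just below expanded to roughly the following predicate, with the constants inlined from the deleted tables above; the __RISCV_INSN_FUNCS generator that replaces it in <asm/insn.h> produces the same shape, plus a BUILD_BUG_ON() that the match value does not escape the mask:

static inline bool is_jal_insn(long insn)
{
	/* MASK_JAL == 0x7f, MATCH_JAL == OPCODE_JAL == 0x6f */
	return (insn & 0x7f) == 0x6f;
}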
- */ -#define DECLARE_INSN(INSN_NAME, INSN_MATCH, INSN_MASK) \ -static inline bool is_ ## INSN_NAME ## _insn(long insn) \ -{ \ - return (insn & (INSN_MASK)) == (INSN_MATCH); \ -} - -#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) -#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) -#define RV_X(X, s, mask) (((X) >> (s)) & (mask)) -#define RVC_X(X, s, mask) RV_X(X, s, mask) - -#define EXTRACT_JTYPE_IMM(x) \ - ({typeof(x) x_ = (x); \ - (RV_X(x_, J_IMM_10_1_OPOFF, J_IMM_10_1_MASK) << J_IMM_10_1_OFF) | \ - (RV_X(x_, J_IMM_11_OPOFF, J_IMM_11_MASK) << J_IMM_11_OFF) | \ - (RV_X(x_, J_IMM_19_12_OPOFF, J_IMM_19_12_MASK) << J_IMM_19_12_OFF) | \ - (RV_IMM_SIGN(x_) << J_IMM_SIGN_OFF); }) - -#define EXTRACT_ITYPE_IMM(x) \ - ({typeof(x) x_ = (x); \ - (RV_X(x_, I_IMM_11_0_OPOFF, I_IMM_11_0_MASK)) | \ - (RV_IMM_SIGN(x_) << I_IMM_SIGN_OFF); }) - -#define EXTRACT_BTYPE_IMM(x) \ - ({typeof(x) x_ = (x); \ - (RV_X(x_, B_IMM_4_1_OPOFF, B_IMM_4_1_MASK) << B_IMM_4_1_OFF) | \ - (RV_X(x_, B_IMM_10_5_OPOFF, B_IMM_10_5_MASK) << B_IMM_10_5_OFF) | \ - (RV_X(x_, B_IMM_11_OPOFF, B_IMM_11_MASK) << B_IMM_11_OFF) | \ - (RV_IMM_SIGN(x_) << B_IMM_SIGN_OFF); }) - -#define EXTRACT_RVC_J_IMM(x) \ - ({typeof(x) x_ = (x); \ - (RVC_X(x_, RVC_J_IMM_3_1_OPOFF, RVC_J_IMM_3_1_MASK) << RVC_J_IMM_3_1_OFF) | \ - (RVC_X(x_, RVC_J_IMM_4_OPOFF, RVC_J_IMM_4_MASK) << RVC_J_IMM_4_OFF) | \ - (RVC_X(x_, RVC_J_IMM_5_OPOFF, RVC_J_IMM_5_MASK) << RVC_J_IMM_5_OFF) | \ - (RVC_X(x_, RVC_J_IMM_6_OPOFF, RVC_J_IMM_6_MASK) << RVC_J_IMM_6_OFF) | \ - (RVC_X(x_, RVC_J_IMM_7_OPOFF, RVC_J_IMM_7_MASK) << RVC_J_IMM_7_OFF) | \ - (RVC_X(x_, RVC_J_IMM_9_8_OPOFF, RVC_J_IMM_9_8_MASK) << RVC_J_IMM_9_8_OFF) | \ - (RVC_X(x_, RVC_J_IMM_10_OPOFF, RVC_J_IMM_10_MASK) << RVC_J_IMM_10_OFF) | \ - (RVC_IMM_SIGN(x_) << RVC_J_IMM_SIGN_OFF); }) - -#define EXTRACT_RVC_B_IMM(x) \ - ({typeof(x) x_ = (x); \ - (RVC_X(x_, RVC_B_IMM_2_1_OPOFF, RVC_B_IMM_2_1_MASK) << RVC_B_IMM_2_1_OFF) | \ - (RVC_X(x_, RVC_B_IMM_4_3_OPOFF, RVC_B_IMM_4_3_MASK) << RVC_B_IMM_4_3_OFF) | \ - (RVC_X(x_, RVC_B_IMM_5_OPOFF, RVC_B_IMM_5_MASK) << RVC_B_IMM_5_OFF) | \ - (RVC_X(x_, RVC_B_IMM_7_6_OPOFF, RVC_B_IMM_7_6_MASK) << RVC_B_IMM_7_6_OFF) | \ - (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); }) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index d8d8de0ded99..ab05f892d317 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -31,7 +31,7 @@ #define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) /* - * Half of the kernel address space (half of the entries of the page global + * Half of the kernel address space (1/4 of the entries of the page global * directory) is for the direct mapping. */ #define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2) @@ -415,7 +415,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * Relying on flush_tlb_fix_spurious_fault would suffice, but * the extra traps reduce performance. So, eagerly SFENCE.VMA. 
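Before leaving the removed parse_asm.h helpers: a hedged user-space check of the B-type immediate decode they implemented, and that RV_EXTRACT_BTYPE_IMM() now provides, against a hand-assembled beq a0, a1, .+8 (0x00b50463). extract_btype_imm() is a hypothetical stand-in for the kernel macro; the shifts and masks are copied from the B_IMM_* definitions above.

#include <assert.h>
#include <stdint.h>

static int32_t extract_btype_imm(uint32_t insn)
{
	uint32_t imm = (((insn >> 8) & 0xf) << 1) |	/* imm[4:1]  from bits 11:8  */
		       (((insn >> 25) & 0x3f) << 5) |	/* imm[10:5] from bits 30:25 */
		       (((insn >> 7) & 0x1) << 11);	/* imm[11]   from bit 7      */
	if (insn >> 31)					/* imm[12] is the sign bit   */
		imm |= 0xfffff000;
	return (int32_t)imm;
}

int main(void)
{
	assert(extract_btype_imm(0x00b50463) == 8);	/* beq a0, a1, .+8 */
	return 0;
}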
*/ - flush_tlb_page(vma, address); + local_flush_tlb_page(address); } #define __HAVE_ARCH_UPDATE_MMU_TLB diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 4ca7fbacff42..945b7be249c1 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -169,9 +169,9 @@ enum sbi_pmu_fw_generic_events_t { SBI_PMU_FW_ILLEGAL_INSN = 4, SBI_PMU_FW_SET_TIMER = 5, SBI_PMU_FW_IPI_SENT = 6, - SBI_PMU_FW_IPI_RECVD = 7, + SBI_PMU_FW_IPI_RCVD = 7, SBI_PMU_FW_FENCE_I_SENT = 8, - SBI_PMU_FW_FENCE_I_RECVD = 9, + SBI_PMU_FW_FENCE_I_RCVD = 9, SBI_PMU_FW_SFENCE_VMA_SENT = 10, SBI_PMU_FW_SFENCE_VMA_RCVD = 11, SBI_PMU_FW_SFENCE_VMA_ASID_SENT = 12, @@ -215,6 +215,9 @@ enum sbi_pmu_ctr_type { #define SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK 0x06 #define SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK 0x01 +#define SBI_PMU_EVENT_CACHE_ID_SHIFT 3 +#define SBI_PMU_EVENT_CACHE_OP_SHIFT 1 + #define SBI_PMU_EVENT_IDX_INVALID 0xFFFFFFFF /* Flags defined for config matching function */ diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h index 532c29ef0376..956ae0a01bad 100644 --- a/arch/riscv/include/asm/signal.h +++ b/arch/riscv/include/asm/signal.h @@ -7,6 +7,6 @@ #include <uapi/asm/ptrace.h> asmlinkage __visible -void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); +void do_work_pending(struct pt_regs *regs, unsigned long thread_info_flags); #endif diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h index 909049366555..a96b1fea24fe 100644 --- a/arch/riscv/include/asm/string.h +++ b/arch/riscv/include/asm/string.h @@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t); #define __HAVE_ARCH_MEMMOVE extern asmlinkage void *memmove(void *, const void *, size_t); extern asmlinkage void *__memmove(void *, const void *, size_t); + +#define __HAVE_ARCH_STRCMP +extern asmlinkage int strcmp(const char *cs, const char *ct); + +#define __HAVE_ARCH_STRLEN +extern asmlinkage __kernel_size_t strlen(const char *); + +#define __HAVE_ARCH_STRNCMP +extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count); + /* For those files which don't want to check by kasan. 
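On the SBI PMU side, the two new shift macros pair with the existing *_ID_CODE_MASK values: result in bit 0, op in bits 2:1, cache id from bit 3 upward. A hedged sketch of packing a cache event code under that assumed layout; sbi_pmu_make_cache_event() is a hypothetical helper, not a kernel or SBI API:

#include <stdint.h>

#define SBI_PMU_EVENT_CACHE_ID_SHIFT	3
#define SBI_PMU_EVENT_CACHE_OP_SHIFT	1

/* Pack cache_id:op_id:result_id to match the 0x06 op and 0x01 result masks. */
static uint32_t sbi_pmu_make_cache_event(uint32_t cache_id, uint32_t op_id,
					 uint32_t result_id)
{
	return (cache_id << SBI_PMU_EVENT_CACHE_ID_SHIFT) |
	       (op_id << SBI_PMU_EVENT_CACHE_OP_SHIFT) |
	       result_id;
}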
*/ #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) #define memcpy(dst, src, len) __memcpy(dst, src, len) diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 11463489fec6..60f8ca01d36e 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -59,7 +59,8 @@ static inline void __switch_to_aux(struct task_struct *prev, static __always_inline bool has_fpu(void) { - return static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_FPU]); + return riscv_has_extension_likely(RISCV_ISA_EXT_f) || + riscv_has_extension_likely(RISCV_ISA_EXT_d); } #else static __always_inline bool has_fpu(void) { return false; } diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 67322f878e0d..f704c8dd57e0 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -43,6 +43,7 @@ #ifndef __ASSEMBLY__ extern long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE / sizeof(long)]; +extern unsigned long spin_shadow_stack; #include <asm/processor.h> #include <asm/csr.h> diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index a7644f46d0e5..f891478829a5 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -28,8 +28,12 @@ #define COMPAT_VDSO_SYMBOL(base, name) \ (void __user *)((unsigned long)(base) + compat__vdso_##name##_offset) +extern char compat_vdso_start[], compat_vdso_end[]; + #endif /* CONFIG_COMPAT */ +extern char vdso_start[], vdso_end[]; + #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_MMU */ diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index a7d26a00beea..2354c69dc7d1 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -11,10 +11,14 @@ #include <linux/cpu.h> #include <linux/uaccess.h> #include <asm/alternative.h> +#include <asm/module.h> #include <asm/sections.h> +#include <asm/vdso.h> #include <asm/vendorid_list.h> #include <asm/sbi.h> #include <asm/csr.h> +#include <asm/insn.h> +#include <asm/patch.h> struct cpu_manufacturer_info_t { unsigned long vendor_id; @@ -53,6 +57,88 @@ static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_inf } } +static u32 riscv_instruction_at(void *p) +{ + u16 *parcel = p; + + return (u32)parcel[0] | (u32)parcel[1] << 16; +} + +static void riscv_alternative_fix_auipc_jalr(void *ptr, u32 auipc_insn, + u32 jalr_insn, int patch_offset) +{ + u32 call[2] = { auipc_insn, jalr_insn }; + s32 imm; + + /* get and adjust new target address */ + imm = riscv_insn_extract_utype_itype_imm(auipc_insn, jalr_insn); + imm -= patch_offset; + + /* update instructions */ + riscv_insn_insert_utype_itype_imm(&call[0], &call[1], imm); + + /* patch the call place again */ + patch_text_nosync(ptr, call, sizeof(u32) * 2); +} + +static void riscv_alternative_fix_jal(void *ptr, u32 jal_insn, int patch_offset) +{ + s32 imm; + + /* get and adjust new target address */ + imm = riscv_insn_extract_jtype_imm(jal_insn); + imm -= patch_offset; + + /* update instruction */ + riscv_insn_insert_jtype_imm(&jal_insn, imm); + + /* patch the call place again */ + patch_text_nosync(ptr, &jal_insn, sizeof(u32)); +} + +void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, + int patch_offset) +{ + int num_insn = len / sizeof(u32); + int i; + + for (i = 0; i < num_insn; i++) { + u32 insn = riscv_instruction_at(alt_ptr + i * sizeof(u32)); + + /* + * May be the start of an auipc + jalr pair + * Needs to check that at least one 
more instruction + * is in the list. + */ + if (riscv_insn_is_auipc(insn) && i < num_insn - 1) { + u32 insn2 = riscv_instruction_at(alt_ptr + (i + 1) * sizeof(u32)); + + if (!riscv_insn_is_jalr(insn2)) + continue; + + /* if instruction pair is a call, it will use the ra register */ + if (RV_EXTRACT_RD_REG(insn) != 1) + continue; + + riscv_alternative_fix_auipc_jalr(alt_ptr + i * sizeof(u32), + insn, insn2, patch_offset); + i++; + } + + if (riscv_insn_is_jal(insn)) { + s32 imm = riscv_insn_extract_jtype_imm(insn); + + /* Don't modify jumps inside the alternative block */ + if ((alt_ptr + i * sizeof(u32) + imm) >= alt_ptr && + (alt_ptr + i * sizeof(u32) + imm) < (alt_ptr + len)) + continue; + + riscv_alternative_fix_jal(alt_ptr + i * sizeof(u32), + insn, patch_offset); + } + } +} + /* * This is called very early in the boot process (directly after we run * a feature detect on the boot CPU). No need to worry about other CPUs @@ -77,6 +163,31 @@ static void __init_or_module _apply_alternatives(struct alt_entry *begin, stage); } +#ifdef CONFIG_MMU +static void __init apply_vdso_alternatives(void) +{ + const Elf_Ehdr *hdr; + const Elf_Shdr *shdr; + const Elf_Shdr *alt; + struct alt_entry *begin, *end; + + hdr = (Elf_Ehdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".alternative"); + if (!alt) + return; + + begin = (void *)hdr + alt->sh_offset; + end = (void *)hdr + alt->sh_offset + alt->sh_size; + + _apply_alternatives((struct alt_entry *)begin, + (struct alt_entry *)end, + RISCV_ALTERNATIVES_BOOT); +} +#else +static void __init apply_vdso_alternatives(void) { } +#endif + void __init apply_boot_alternatives(void) { /* If called on non-boot cpu things could go wrong */ @@ -85,6 +196,8 @@ void __init apply_boot_alternatives(void) _apply_alternatives((struct alt_entry *)__alt_start, (struct alt_entry *)__alt_end, RISCV_ALTERNATIVES_BOOT); + + apply_vdso_alternatives(); } /* diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 1b9a5a66e55a..8400f0cc9704 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -144,30 +144,54 @@ arch_initcall(riscv_cpuinfo_init); .uprop = #UPROP, \ .isa_ext_id = EXTID, \ } + /* - * Here are the ordering rules of extension naming defined by RISC-V - * specification : - * 1. All extensions should be separated from other multi-letter extensions - * by an underscore. - * 2. The first letter following the 'Z' conventionally indicates the most + * The canonical order of ISA extension names in the ISA string is defined in + * chapter 27 of the unprivileged specification. + * + * Ordinarily, for in-kernel data structures, this order is unimportant but + * isa_ext_arr defines the order of the ISA string in /proc/cpuinfo. + * + * The specification uses vague wording, such as should, when it comes to + * ordering, so for our purposes the following rules apply: + * + * 1. All multi-letter extensions must be separated from other extensions by an + * underscore. + * + * 2. Additional standard extensions (starting with 'Z') must be sorted after + * single-letter extensions and before any higher-privileged extensions. + * + * 3. The first letter following the 'Z' conventionally indicates the most * closely related alphabetical extension category, IMAFDQLCBKJTPVH. - If multiple 'Z' extensions are named, they should be ordered first - by category, then alphabetically within a category. - 3. Standard supervisor-level extensions (starts with 'S') should be - listed after standard unprivileged extensions.
If multiple - supervisor-level extensions are listed, they should be ordered + * If multiple 'Z' extensions are named, they must be ordered first by + * category, then alphabetically within a category. + * + * 4. Standard supervisor-level extensions (starting with 'S') must be listed + * after standard unprivileged extensions. If multiple supervisor-level + * extensions are listed, they must be ordered alphabetically. + * + * 5. Standard machine-level extensions (starting with 'Zxm') must be listed + * after any lower-privileged, standard extensions. If multiple + * machine-level extensions are listed, they must be ordered * alphabetically. - * 4. Non-standard extensions (starts with 'X') must be listed after all - * standard extensions. They must be separated from other multi-letter - * extensions by an underscore. + * + * 6. Non-standard extensions (starting with 'X') must be listed after all + * standard extensions. If multiple non-standard extensions are listed, they + * must be ordered alphabetically. + * + * An example string following the order is: + * rv64imadc_zifoo_zigoo_zafoo_sbar_scar_zxmbaz_xqux_xrux + * + * New entries to this struct should follow the ordering rules described above. */ static struct riscv_isa_ext_data isa_ext_arr[] = { + __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), + __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), - __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), - __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), __RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX), }; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 93e45560af30..59d58ee0f68d 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -10,6 +10,7 @@ #include <linux/ctype.h> #include <linux/libfdt.h> #include <linux/log2.h> +#include <linux/memory.h> #include <linux/module.h> #include <linux/of.h> #include <asm/alternative.h> @@ -29,9 +30,6 @@ unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly; -DEFINE_STATIC_KEY_ARRAY_FALSE(riscv_isa_ext_keys, RISCV_ISA_EXT_KEY_MAX); -EXPORT_SYMBOL(riscv_isa_ext_keys); - /** * riscv_isa_extension_base() - Get base extension word * @@ -222,12 +220,14 @@ void __init riscv_fill_hwcap(void) set_bit(nr, this_isa); } } else { + /* sorted alphabetically */ SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); + SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); + SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); + SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB); SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE); - SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); - SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); } #undef SET_ISA_EXT_MAP } @@ -266,81 +266,38 @@ void __init riscv_fill_hwcap(void) if (elf_hwcap & BIT_MASK(i)) print_str[j++] = (char)('a' + i); pr_info("riscv: ELF capabilities %s\n", print_str); - - for_each_set_bit(i, riscv_isa, RISCV_ISA_EXT_MAX) { - j = riscv_isa_ext2key(i); - if (j >= 0) - static_branch_enable(&riscv_isa_ext_keys[j]); - } } #ifdef CONFIG_RISCV_ALTERNATIVE -static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
-{ - if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT)) - return false; - - if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) - return false; - - return riscv_isa_extension_available(NULL, SVPBMT); -} - -static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage) -{ - if (!IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM)) - return false; - - if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) - return false; - - if (!riscv_isa_extension_available(NULL, ZICBOM)) - return false; - - riscv_noncoherent_supported(); - return true; -} - -/* - * Probe presence of individual extensions. - * - * This code may also be executed before kernel relocation, so we cannot use - * addresses generated by the address-of operator as they won't be valid in - * this context. - */ -static u32 __init_or_module cpufeature_probe(unsigned int stage) -{ - u32 cpu_req_feature = 0; - - if (cpufeature_probe_svpbmt(stage)) - cpu_req_feature |= BIT(CPUFEATURE_SVPBMT); - - if (cpufeature_probe_zicbom(stage)) - cpu_req_feature |= BIT(CPUFEATURE_ZICBOM); - - return cpu_req_feature; -} - void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage) { - u32 cpu_req_feature = cpufeature_probe(stage); struct alt_entry *alt; - u32 tmp; + void *oldptr, *altptr; + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return; for (alt = begin; alt < end; alt++) { if (alt->vendor_id != 0) continue; - if (alt->errata_id >= CPUFEATURE_NUMBER) { - WARN(1, "This feature id:%d is not in kernel cpufeature list", + if (alt->errata_id >= RISCV_ISA_EXT_MAX) { + WARN(1, "This extension id:%d is not in ISA extension list", alt->errata_id); continue; } - tmp = (1U << alt->errata_id); - if (cpu_req_feature & tmp) - patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + if (!__riscv_isa_extension_available(NULL, alt->errata_id)) + continue; + + oldptr = ALT_OLD_PTR(alt); + altptr = ALT_ALT_PTR(alt); + + mutex_lock(&text_mutex); + patch_text_nosync(oldptr, altptr, alt->alt_len); + riscv_alternative_fix_offsets(oldptr, alt->alt_len, oldptr - altptr); + mutex_unlock(&text_mutex); } } #endif diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 2086f6585773..5bff37af4770 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -55,12 +55,15 @@ static int ftrace_check_current_call(unsigned long hook_pos, } static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, - bool enable) + bool enable, bool ra) { unsigned int call[2]; unsigned int nops[2] = {NOP4, NOP4}; - make_call(hook_pos, target, call); + if (ra) + make_call_ra(hook_pos, target, call); + else + make_call_t0(hook_pos, target, call); /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */ if (patch_text_nosync @@ -70,42 +73,13 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, return 0; } -/* - * Put 5 instructions with 16 bytes at the front of function within - * patchable function entry nops' area. - * - * 0: REG_S ra, -SZREG(sp) - * 1: auipc ra, 0x? - * 2: jalr -?(ra) - * 3: REG_L ra, -SZREG(sp) - * - * So the opcodes is: - * 0: 0xfe113c23 (sd)/0xfe112e23 (sw) - * 1: 0x???????? -> auipc - * 2: 0x???????? 
-> jalr - * 3: 0xff813083 (ld)/0xffc12083 (lw) - */ -#if __riscv_xlen == 64 -#define INSN0 0xfe113c23 -#define INSN3 0xff813083 -#elif __riscv_xlen == 32 -#define INSN0 0xfe112e23 -#define INSN3 0xffc12083 -#endif - -#define FUNC_ENTRY_SIZE 16 -#define FUNC_ENTRY_JMP 4 - int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned int call[4] = {INSN0, 0, 0, INSN3}; - unsigned long target = addr; - unsigned long caller = rec->ip + FUNC_ENTRY_JMP; + unsigned int call[2]; - call[1] = to_auipc_insn((unsigned int)(target - caller)); - call[2] = to_jalr_insn((unsigned int)(target - caller)); + make_call_t0(rec->ip, addr, call); - if (patch_text_nosync((void *)rec->ip, call, FUNC_ENTRY_SIZE)) + if (patch_text_nosync((void *)rec->ip, call, MCOUNT_INSN_SIZE)) return -EPERM; return 0; @@ -114,15 +88,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int nops[4] = {NOP4, NOP4, NOP4, NOP4}; + unsigned int nops[2] = {NOP4, NOP4}; - if (patch_text_nosync((void *)rec->ip, nops, FUNC_ENTRY_SIZE)) + if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE)) return -EPERM; return 0; } - /* * This is called early on, and isn't wrapped by * ftrace_arch_code_modify_{prepare,post_process}() and therefor doesn't hold @@ -144,10 +117,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) int ftrace_update_ftrace_func(ftrace_func_t func) { int ret = __ftrace_modify_call((unsigned long)&ftrace_call, - (unsigned long)func, true); + (unsigned long)func, true, true); if (!ret) { ret = __ftrace_modify_call((unsigned long)&ftrace_regs_call, - (unsigned long)func, true); + (unsigned long)func, true, true); } return ret; @@ -159,16 +132,16 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { unsigned int call[2]; - unsigned long caller = rec->ip + FUNC_ENTRY_JMP; + unsigned long caller = rec->ip; int ret; - make_call(caller, old_addr, call); + make_call_t0(caller, old_addr, call); ret = ftrace_check_current_call(caller, call); if (ret) return ret; - return __ftrace_modify_call(caller, addr, true); + return __ftrace_modify_call(caller, addr, true, false); } #endif @@ -203,12 +176,12 @@ int ftrace_enable_ftrace_graph_caller(void) int ret; ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, true); + (unsigned long)&prepare_ftrace_return, true, true); if (ret) return ret; return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, - (unsigned long)&prepare_ftrace_return, true); + (unsigned long)&prepare_ftrace_return, true, true); } int ftrace_disable_ftrace_graph_caller(void) @@ -216,12 +189,12 @@ int ftrace_disable_ftrace_graph_caller(void) int ret; ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, false); + (unsigned long)&prepare_ftrace_return, false, true); if (ret) return ret; return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, - (unsigned long)&prepare_ftrace_return, false); + (unsigned long)&prepare_ftrace_return, false, true); } #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/riscv/kernel/kgdb.c b/arch/riscv/kernel/kgdb.c index 963ed7edcff2..2e0266ae6bd7 100644 --- a/arch/riscv/kernel/kgdb.c +++ b/arch/riscv/kernel/kgdb.c @@ -11,7 +11,7 @@ #include <linux/string.h> #include <asm/cacheflush.h> #include <asm/gdb_xml.h> -#include <asm/parse_asm.h> 
+#include <asm/insn.h> enum { NOT_KGDB_BREAK = 0, @@ -23,27 +23,6 @@ enum { static unsigned long stepped_address; static unsigned int stepped_opcode; -#if __riscv_xlen == 32 -/* C.JAL is an RV32C-only instruction */ -DECLARE_INSN(c_jal, MATCH_C_JAL, MASK_C_JAL) -#else -#define is_c_jal_insn(opcode) 0 -#endif -DECLARE_INSN(jalr, MATCH_JALR, MASK_JALR) -DECLARE_INSN(jal, MATCH_JAL, MASK_JAL) -DECLARE_INSN(c_jr, MATCH_C_JR, MASK_C_JR) -DECLARE_INSN(c_jalr, MATCH_C_JALR, MASK_C_JALR) -DECLARE_INSN(c_j, MATCH_C_J, MASK_C_J) -DECLARE_INSN(beq, MATCH_BEQ, MASK_BEQ) -DECLARE_INSN(bne, MATCH_BNE, MASK_BNE) -DECLARE_INSN(blt, MATCH_BLT, MASK_BLT) -DECLARE_INSN(bge, MATCH_BGE, MASK_BGE) -DECLARE_INSN(bltu, MATCH_BLTU, MASK_BLTU) -DECLARE_INSN(bgeu, MATCH_BGEU, MASK_BGEU) -DECLARE_INSN(c_beqz, MATCH_C_BEQZ, MASK_C_BEQZ) -DECLARE_INSN(c_bnez, MATCH_C_BNEZ, MASK_C_BNEZ) -DECLARE_INSN(sret, MATCH_SRET, MASK_SRET) - static int decode_register_index(unsigned long opcode, int offset) { return (opcode >> offset) & 0x1F; @@ -65,23 +44,25 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) if (get_kernel_nofault(op_code, (void *)pc)) return -EINVAL; if ((op_code & __INSN_LENGTH_MASK) != __INSN_LENGTH_GE_32) { - if (is_c_jalr_insn(op_code) || is_c_jr_insn(op_code)) { + if (riscv_insn_is_c_jalr(op_code) || + riscv_insn_is_c_jr(op_code)) { rs1_num = decode_register_index(op_code, RVC_C2_RS1_OPOFF); *next_addr = regs_ptr[rs1_num]; - } else if (is_c_j_insn(op_code) || is_c_jal_insn(op_code)) { - *next_addr = EXTRACT_RVC_J_IMM(op_code) + pc; - } else if (is_c_beqz_insn(op_code)) { + } else if (riscv_insn_is_c_j(op_code) || + riscv_insn_is_c_jal(op_code)) { + *next_addr = RVC_EXTRACT_JTYPE_IMM(op_code) + pc; + } else if (riscv_insn_is_c_beqz(op_code)) { rs1_num = decode_register_index_short(op_code, RVC_C1_RS1_OPOFF); if (!rs1_num || regs_ptr[rs1_num] == 0) - *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc; + *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc; else *next_addr = pc + 2; - } else if (is_c_bnez_insn(op_code)) { + } else if (riscv_insn_is_c_bnez(op_code)) { rs1_num = decode_register_index_short(op_code, RVC_C1_RS1_OPOFF); if (rs1_num && regs_ptr[rs1_num] != 0) - *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc; + *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc; else *next_addr = pc + 2; } else { @@ -90,7 +71,7 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) } else { if ((op_code & __INSN_OPCODE_MASK) == __INSN_BRANCH_OPCODE) { bool result = false; - long imm = EXTRACT_BTYPE_IMM(op_code); + long imm = RV_EXTRACT_BTYPE_IMM(op_code); unsigned long rs1_val = 0, rs2_val = 0; rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF); @@ -100,34 +81,34 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) if (rs2_num) rs2_val = regs_ptr[rs2_num]; - if (is_beq_insn(op_code)) + if (riscv_insn_is_beq(op_code)) result = (rs1_val == rs2_val) ? true : false; - else if (is_bne_insn(op_code)) + else if (riscv_insn_is_bne(op_code)) result = (rs1_val != rs2_val) ? true : false; - else if (is_blt_insn(op_code)) + else if (riscv_insn_is_blt(op_code)) result = ((long)rs1_val < (long)rs2_val) ? true : false; - else if (is_bge_insn(op_code)) + else if (riscv_insn_is_bge(op_code)) result = ((long)rs1_val >= (long)rs2_val) ? true : false; - else if (is_bltu_insn(op_code)) + else if (riscv_insn_is_bltu(op_code)) result = (rs1_val < rs2_val) ? 
true : false; - else if (is_bgeu_insn(op_code)) + else if (riscv_insn_is_bgeu(op_code)) result = (rs1_val >= rs2_val) ? true : false; if (result) *next_addr = imm + pc; else *next_addr = pc + 4; - } else if (is_jal_insn(op_code)) { - *next_addr = EXTRACT_JTYPE_IMM(op_code) + pc; - } else if (is_jalr_insn(op_code)) { + } else if (riscv_insn_is_jal(op_code)) { + *next_addr = RV_EXTRACT_JTYPE_IMM(op_code) + pc; + } else if (riscv_insn_is_jalr(op_code)) { rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF); if (rs1_num) *next_addr = ((unsigned long *)regs)[rs1_num]; - *next_addr += EXTRACT_ITYPE_IMM(op_code); - } else if (is_sret_insn(op_code)) { + *next_addr += RV_EXTRACT_ITYPE_IMM(op_code); + } else if (riscv_insn_is_sret(op_code)) { *next_addr = pc; } else { *next_addr = pc + 4; diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index d171eca623b6..125de818d1ba 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -13,8 +13,8 @@ .text -#define FENTRY_RA_OFFSET 12 -#define ABI_SIZE_ON_STACK 72 +#define FENTRY_RA_OFFSET 8 +#define ABI_SIZE_ON_STACK 80 #define ABI_A0 0 #define ABI_A1 8 #define ABI_A2 16 @@ -23,10 +23,10 @@ #define ABI_A5 40 #define ABI_A6 48 #define ABI_A7 56 -#define ABI_RA 64 +#define ABI_T0 64 +#define ABI_RA 72 .macro SAVE_ABI - addi sp, sp, -SZREG addi sp, sp, -ABI_SIZE_ON_STACK REG_S a0, ABI_A0(sp) @@ -37,6 +37,7 @@ REG_S a5, ABI_A5(sp) REG_S a6, ABI_A6(sp) REG_S a7, ABI_A7(sp) + REG_S t0, ABI_T0(sp) REG_S ra, ABI_RA(sp) .endm @@ -49,24 +50,18 @@ REG_L a5, ABI_A5(sp) REG_L a6, ABI_A6(sp) REG_L a7, ABI_A7(sp) + REG_L t0, ABI_T0(sp) REG_L ra, ABI_RA(sp) addi sp, sp, ABI_SIZE_ON_STACK - addi sp, sp, SZREG .endm #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS .macro SAVE_ALL - addi sp, sp, -SZREG addi sp, sp, -PT_SIZE_ON_STACK - REG_S x1, PT_EPC(sp) - addi sp, sp, PT_SIZE_ON_STACK - REG_L x1, (sp) - addi sp, sp, -PT_SIZE_ON_STACK + REG_S t0, PT_EPC(sp) REG_S x1, PT_RA(sp) - REG_L x1, PT_EPC(sp) - REG_S x2, PT_SP(sp) REG_S x3, PT_GP(sp) REG_S x4, PT_TP(sp) @@ -100,15 +95,11 @@ .endm .macro RESTORE_ALL + REG_L t0, PT_EPC(sp) REG_L x1, PT_RA(sp) - addi sp, sp, PT_SIZE_ON_STACK - REG_S x1, (sp) - addi sp, sp, -PT_SIZE_ON_STACK - REG_L x1, PT_EPC(sp) REG_L x2, PT_SP(sp) REG_L x3, PT_GP(sp) REG_L x4, PT_TP(sp) - REG_L x5, PT_T0(sp) REG_L x6, PT_T1(sp) REG_L x7, PT_T2(sp) REG_L x8, PT_S0(sp) @@ -137,17 +128,16 @@ REG_L x31, PT_T6(sp) addi sp, sp, PT_SIZE_ON_STACK - addi sp, sp, SZREG .endm #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ ENTRY(ftrace_caller) SAVE_ABI - addi a0, ra, -FENTRY_RA_OFFSET + addi a0, t0, -FENTRY_RA_OFFSET la a1, function_trace_op REG_L a2, 0(a1) - REG_L a1, ABI_SIZE_ON_STACK(sp) + mv a1, ra mv a3, sp ftrace_call: @@ -155,8 +145,8 @@ ftrace_call: call ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, ABI_SIZE_ON_STACK - REG_L a1, ABI_RA(sp) + addi a0, sp, ABI_RA + REG_L a1, ABI_T0(sp) addi a1, a1, -FENTRY_RA_OFFSET #ifdef HAVE_FUNCTION_GRAPH_FP_TEST mv a2, s0 @@ -166,17 +156,17 @@ ftrace_graph_call: call ftrace_stub #endif RESTORE_ABI - ret + jr t0 ENDPROC(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS ENTRY(ftrace_regs_caller) SAVE_ALL - addi a0, ra, -FENTRY_RA_OFFSET + addi a0, t0, -FENTRY_RA_OFFSET la a1, function_trace_op REG_L a2, 0(a1) - REG_L a1, PT_SIZE_ON_STACK(sp) + mv a1, ra mv a3, sp ftrace_regs_call: @@ -196,6 +186,6 @@ ftrace_graph_regs_call: #endif RESTORE_ALL - ret + jr t0 ENDPROC(ftrace_regs_caller) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ diff --git 
a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 91fe16bfaa07..7c651d55fcbd 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -268,6 +268,13 @@ static int apply_r_riscv_align_rela(struct module *me, u32 *location, return -EINVAL; } +static int apply_r_riscv_add16_rela(struct module *me, u32 *location, + Elf_Addr v) +{ + *(u16 *)location += (u16)v; + return 0; +} + static int apply_r_riscv_add32_rela(struct module *me, u32 *location, Elf_Addr v) { @@ -282,6 +289,13 @@ static int apply_r_riscv_add64_rela(struct module *me, u32 *location, return 0; } +static int apply_r_riscv_sub16_rela(struct module *me, u32 *location, + Elf_Addr v) +{ + *(u16 *)location -= (u16)v; + return 0; +} + static int apply_r_riscv_sub32_rela(struct module *me, u32 *location, Elf_Addr v) { @@ -315,8 +329,10 @@ static int (*reloc_handlers_rela[]) (struct module *me, u32 *location, [R_RISCV_CALL] = apply_r_riscv_call_rela, [R_RISCV_RELAX] = apply_r_riscv_relax_rela, [R_RISCV_ALIGN] = apply_r_riscv_align_rela, + [R_RISCV_ADD16] = apply_r_riscv_add16_rela, [R_RISCV_ADD32] = apply_r_riscv_add32_rela, [R_RISCV_ADD64] = apply_r_riscv_add64_rela, + [R_RISCV_SUB16] = apply_r_riscv_sub16_rela, [R_RISCV_SUB32] = apply_r_riscv_sub32_rela, [R_RISCV_SUB64] = apply_r_riscv_sub64_rela, }; @@ -429,21 +445,6 @@ void *module_alloc(unsigned long size) } #endif -static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - const char *name) -{ - const Elf_Shdr *s, *se; - const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { - if (strcmp(name, secstrs + s->sh_name) == 0) - return s; - } - - return NULL; -} - int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index a20568bd1f1a..7441ac8a6843 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -136,13 +136,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re #define branch_offset(opcode) \ sign_extend32((branch_imm(opcode)), 12) -#define BRANCH_BEQ 0x0 -#define BRANCH_BNE 0x1 -#define BRANCH_BLT 0x4 -#define BRANCH_BGE 0x5 -#define BRANCH_BLTU 0x6 -#define BRANCH_BGEU 0x7 - bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -169,22 +162,22 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r offset_tmp = branch_offset(opcode); switch (branch_funct3(opcode)) { - case BRANCH_BEQ: + case RVG_FUNCT3_BEQ: offset = (rs1_val == rs2_val) ? offset_tmp : 4; break; - case BRANCH_BNE: + case RVG_FUNCT3_BNE: offset = (rs1_val != rs2_val) ? offset_tmp : 4; break; - case BRANCH_BLT: + case RVG_FUNCT3_BLT: offset = ((long)rs1_val < (long)rs2_val) ? offset_tmp : 4; break; - case BRANCH_BGE: + case RVG_FUNCT3_BGE: offset = ((long)rs1_val >= (long)rs2_val) ? offset_tmp : 4; break; - case BRANCH_BLTU: + case RVG_FUNCT3_BLTU: offset = (rs1_val < rs2_val) ? offset_tmp : 4; break; - case BRANCH_BGEU: + case RVG_FUNCT3_BGEU: offset = (rs1_val >= rs2_val) ? 
offset_tmp : 4; break; default: diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index de8474146a9b..61e35db31001 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -3,14 +3,7 @@ #ifndef _RISCV_KERNEL_PROBES_SIMULATE_INSN_H #define _RISCV_KERNEL_PROBES_SIMULATE_INSN_H -#define __RISCV_INSN_FUNCS(name, mask, val) \ -static __always_inline bool riscv_insn_is_##name(probe_opcode_t code) \ -{ \ - BUILD_BUG_ON(~(mask) & (val)); \ - return (code & (mask)) == (val); \ -} \ -bool simulate_##name(u32 opcode, unsigned long addr, \ - struct pt_regs *regs) +#include <asm/insn.h> #define RISCV_INSN_REJECTED(name, code) \ do { \ @@ -19,9 +12,6 @@ bool simulate_##name(u32 opcode, unsigned long addr, \ } \ } while (0) -__RISCV_INSN_FUNCS(system, 0x7f, 0x73); -__RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); - #define RISCV_INSN_SET_SIMULATE(name, code) \ do { \ if (riscv_insn_is_##name(code)) { \ @@ -30,18 +20,9 @@ __RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); } \ } while (0) -__RISCV_INSN_FUNCS(c_j, 0xe003, 0xa001); -__RISCV_INSN_FUNCS(c_jr, 0xf07f, 0x8002); -__RISCV_INSN_FUNCS(c_jal, 0xe003, 0x2001); -__RISCV_INSN_FUNCS(c_jalr, 0xf07f, 0x9002); -__RISCV_INSN_FUNCS(c_beqz, 0xe003, 0xc001); -__RISCV_INSN_FUNCS(c_bnez, 0xe003, 0xe001); -__RISCV_INSN_FUNCS(c_ebreak, 0xffff, 0x9002); - -__RISCV_INSN_FUNCS(auipc, 0x7f, 0x17); -__RISCV_INSN_FUNCS(branch, 0x7f, 0x63); - -__RISCV_INSN_FUNCS(jal, 0x7f, 0x6f); -__RISCV_INSN_FUNCS(jalr, 0x707f, 0x67); +bool simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); #endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c index 5ab1c7e1a6ed..a72879b4249a 100644 --- a/arch/riscv/kernel/riscv_ksyms.c +++ b/arch/riscv/kernel/riscv_ksyms.c @@ -12,6 +12,9 @@ EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(__memset); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(__memmove); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 86acd690d529..376d2827e736 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -300,6 +300,9 @@ void __init setup_arch(char **cmdline_p) riscv_init_cbom_blocksize(); riscv_fill_hwcap(); apply_boot_alternatives(); + if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && + riscv_isa_extension_available(NULL, ZICBOM)) + riscv_noncoherent_supported(); } static int __init topology_init(void) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 549bde5c970a..f6fda94e8e59 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -29,22 +29,46 @@ int show_unhandled_signals = 1; static DEFINE_SPINLOCK(die_lock); +static void dump_kernel_instr(const char *loglvl, struct pt_regs *regs) +{ + char str[sizeof("0000 ") * 12 + 2 + 1], *p = str; + const u16 *insns = (u16 *)instruction_pointer(regs); + long bad; + u16 val; + int i; + + for (i = -10; i < 2; i++) { + bad = get_kernel_nofault(val, &insns[i]); + if (!bad) { + p += sprintf(p, i == 0 ? 
"(%04hx) " : "%04hx ", val); + } else { + printk("%sCode: Unable to access instruction at 0x%px.\n", + loglvl, &insns[i]); + return; + } + } + printk("%sCode: %s\n", loglvl, str); +} + void die(struct pt_regs *regs, const char *str) { static int die_counter; int ret; long cause; + unsigned long flags; oops_enter(); - spin_lock_irq(&die_lock); + spin_lock_irqsave(&die_lock, flags); console_verbose(); bust_spinlocks(1); pr_emerg("%s [#%d]\n", str, ++die_counter); print_modules(); - if (regs) + if (regs) { show_regs(regs); + dump_kernel_instr(KERN_EMERG, regs); + } cause = regs ? regs->cause : -1; ret = notify_die(DIE_OOPS, str, regs, 0, cause, SIGSEGV); @@ -54,7 +78,7 @@ void die(struct pt_regs *regs, const char *str) bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irq(&die_lock); + spin_unlock_irqrestore(&die_lock, flags); oops_exit(); if (in_interrupt()) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 5c30212d8d1c..cc2d1e8c8736 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -22,11 +22,6 @@ struct vdso_data { }; #endif -extern char vdso_start[], vdso_end[]; -#ifdef CONFIG_COMPAT -extern char compat_vdso_start[], compat_vdso_end[]; -#endif - enum vvar_pages { VVAR_DATA_PAGE_OFFSET, VVAR_TIMENS_PAGE_OFFSET, diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 150b1a572e61..4a0606633290 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -40,6 +40,13 @@ SECTIONS . = 0x800; .text : { *(.text .text.*) } :text + . = ALIGN(4); + .alternative : { + __alt_start = .; + *(.alternative) + __alt_end = .; + } + .data : { *(.got.plt) *(.got) *(.data .data.* .gnu.linkonce.d.*) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 643ab60e9efb..53a8ad65b255 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -5,6 +5,7 @@ */ #define RO_EXCEPTION_TABLE_ALIGN 4 +#define RUNTIME_DISCARD_EXIT #ifdef CONFIG_XIP_KERNEL #include "vmlinux-xip.lds.S" @@ -85,6 +86,9 @@ SECTIONS /* Start of init data section */ __init_data_begin = .; INIT_DATA_SECTION(16) + .init.bss : { + *(.init.bss) /* from the EFI stub */ + } .exit.data : { EXIT_DATA @@ -95,6 +99,10 @@ SECTIONS *(.rel.dyn*) } + .rela.dyn : { + *(.rela*) + } + __init_data_end = .; . 
= ALIGN(8); @@ -140,6 +148,7 @@ SECTIONS STABS_DEBUG DWARF_DEBUG ELF_DETAILS + .riscv.attributes 0 : { *(.riscv.attributes) } DISCARDS } diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f36a737d5f96..d5a658a047a7 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select KVM_MMIO diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 019df9208bdd..278e97c06e0a 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -25,3 +25,4 @@ kvm-y += vcpu_sbi_base.o kvm-y += vcpu_sbi_replace.o kvm-y += vcpu_sbi_hsm.o kvm-y += vcpu_timer.o +kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 58c5489d3031..41ad7639a17b 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,16 +20,6 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; @@ -49,7 +39,8 @@ int kvm_arch_hardware_enable(void) hideleg |= (1UL << IRQ_VS_EXT); csr_write(CSR_HIDELEG, hideleg); - csr_write(CSR_HCOUNTEREN, -1UL); + /* VS should access only the time counter directly. Everything else should trap */ + csr_write(CSR_HCOUNTEREN, 0x02); csr_write(CSR_HVIP, 0); @@ -70,7 +61,7 @@ void kvm_arch_hardware_disable(void) csr_write(CSR_HIDELEG, 0); } -int kvm_arch_init(void *opaque) +static int __init riscv_kvm_init(void) { const char *str; @@ -115,16 +106,7 @@ int kvm_arch_init(void *opaque) kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); - return 0; -} - -void kvm_arch_exit(void) -{ -} - -static int __init riscv_kvm_init(void) -{ - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); } module_init(riscv_kvm_init); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 34b57e0be2ef..78211aed36fa 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -20,12 +20,12 @@ #include <asm/pgtable.h> #ifdef CONFIG_64BIT -static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 3; +static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels __ro_after_init = 3; #define gstage_index_bits 9 #else -static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 2; +static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels __ro_after_init = 2; #define gstage_index_bits 10 #endif @@ -585,7 +585,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) @@ -603,7 +603,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + WARN_ON(size != 
PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) @@ -645,12 +645,12 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, if (logging || (vma->vm_flags & VM_PFNMAP)) vma_pagesize = PAGE_SIZE; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE) + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; mmap_read_unlock(current->mm); - if (vma_pagesize != PGDIR_SIZE && + if (vma_pagesize != PUD_SIZE && vma_pagesize != PMD_SIZE && vma_pagesize != PAGE_SIZE) { kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize); @@ -758,7 +758,7 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) kvm_riscv_local_hfence_gvma_all(); } -void kvm_riscv_gstage_mode_detect(void) +void __init kvm_riscv_gstage_mode_detect(void) { #ifdef CONFIG_64BIT /* Try Sv57x4 G-stage mode */ @@ -782,7 +782,7 @@ skip_sv48x4_test: #endif } -unsigned long kvm_riscv_gstage_mode(void) +unsigned long __init kvm_riscv_gstage_mode(void) { return gstage_mode >> HGATP_MODE_SHIFT; } diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 309d79b3e5cd..0e5479600695 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -15,8 +15,7 @@ #include <asm/hwcap.h> #include <asm/insn-def.h> -#define has_svinval() \ - static_branch_unlikely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_SVINVAL]) +#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, gpa_t gpa, gpa_t gpsz, @@ -181,6 +180,7 @@ void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) { + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD); local_flush_icache_all(); } @@ -264,15 +264,18 @@ void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) d.addr, d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_GVA: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); kvm_riscv_local_hfence_vvma_asid_gva( READ_ONCE(v->vmid), d.asid, d.addr, d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_ALL: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); kvm_riscv_local_hfence_vvma_asid_all( READ_ONCE(v->vmid), d.asid); break; case KVM_RISCV_HFENCE_VVMA_GVA: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); kvm_riscv_local_hfence_vvma_gva( READ_ONCE(v->vmid), d.addr, d.size, d.order); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7c08567097f0..7d010b0be54e 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -138,6 +138,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) WRITE_ONCE(vcpu->arch.irqs_pending, 0); WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); + kvm_riscv_vcpu_pmu_reset(vcpu); + vcpu->arch.hfence_head = 0; vcpu->arch.hfence_tail = 0; memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue)); @@ -194,6 +196,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); + /* setup performance monitoring */ + kvm_riscv_vcpu_pmu_init(vcpu); + /* Reset VCPU */ kvm_riscv_reset_vcpu(vcpu); @@ -216,6 +221,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) /* Cleanup VCPU timer */ kvm_riscv_vcpu_timer_deinit(vcpu); + kvm_riscv_vcpu_pmu_deinit(vcpu); + /* Free unused pages pre-allocated for G-stage page table mappings */ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } diff --git a/arch/riscv/kvm/vcpu_exit.c 
b/arch/riscv/kvm/vcpu_exit.c index c9f741ab26f5..4ea101a73d8b 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -160,6 +160,9 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, /* Set Guest PC to Guest exception vector */ vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC); + + /* Set Guest privilege mode to supervisor */ + vcpu->arch.guest_context.sstatus |= SR_SPP; } /* @@ -179,6 +182,12 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, ret = -EFAULT; run->exit_reason = KVM_EXIT_UNKNOWN; switch (trap->scause) { + case EXC_INST_ILLEGAL: + if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) { + kvm_riscv_vcpu_trap_redirect(vcpu, trap); + ret = 1; + } + break; case EXC_VIRTUAL_INST_FAULT: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) ret = kvm_riscv_vcpu_virtual_insn(vcpu, run, trap); diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 0bb52761a3f7..f689337b78ff 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -213,7 +213,9 @@ struct csr_func { unsigned long wr_mask); }; -static const struct csr_func csr_funcs[] = { }; +static const struct csr_func csr_funcs[] = { + KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS +}; /** * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c new file mode 100644 index 000000000000..86391a5061dd --- /dev/null +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -0,0 +1,633 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/perf/riscv_pmu.h> +#include <asm/csr.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/kvm_vcpu_pmu.h> +#include <linux/bitops.h> + +#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs) +#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16) +#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK) + +static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = { + [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES, + [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS, + [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES, + [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES, + [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES, + [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES, + [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, + [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND, + [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES, +}; + +static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc) +{ + u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0); + u64 sample_period; + + if (!pmc->counter_val) + sample_period = counter_val_mask + 1; + else + sample_period = (-pmc->counter_val) & counter_val_mask; + + return sample_period; +} + +static u32 kvm_pmu_get_perf_event_type(unsigned long eidx) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 type = PERF_TYPE_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + type = PERF_TYPE_HARDWARE; + break; + case SBI_PMU_EVENT_TYPE_CACHE: + type = PERF_TYPE_HW_CACHE; + break; + case SBI_PMU_EVENT_TYPE_RAW: + case SBI_PMU_EVENT_TYPE_FW: + type = PERF_TYPE_RAW; + break; + default: + break; + 
} + + return type; +} + +static bool kvm_pmu_is_fw_event(unsigned long eidx) +{ + return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW; +} + +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code) +{ + return hw_event_perf_map[sbi_event_code]; +} + +static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code) +{ + u64 config = U64_MAX; + unsigned int cache_type, cache_op, cache_result; + + /* All the cache event masks lie within 0xFF. No separate masking is necessary */ + cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_ID_SHIFT; + cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_OP_SHIFT; + cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK; + + if (cache_type >= PERF_COUNT_HW_CACHE_MAX || + cache_op >= PERF_COUNT_HW_CACHE_OP_MAX || + cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return config; + + config = cache_type | (cache_op << 8) | (cache_result << 16); + + return config; +} + +static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 ecode = get_event_code(eidx); + u64 config = U64_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + if (ecode < SBI_PMU_HW_GENERAL_MAX) + config = kvm_pmu_get_perf_event_hw_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_CACHE: + config = kvm_pmu_get_perf_event_cache_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_RAW: + config = evt_data & RISCV_PMU_RAW_EVENT_MASK; + break; + case SBI_PMU_EVENT_TYPE_FW: + if (ecode < SBI_PMU_FW_MAX) + config = (1ULL << 63) | ecode; + break; + default: + break; + } + + return config; +} + +static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx) +{ + u32 etype = kvm_pmu_get_perf_event_type(eidx); + u32 ecode = get_event_code(eidx); + + if (etype != SBI_PMU_EVENT_TYPE_HW) + return -EINVAL; + + if (ecode == SBI_PMU_HW_CPU_CYCLES) + return 0; + else if (ecode == SBI_PMU_HW_INSTRUCTIONS) + return 2; + else + return -EINVAL; +} + +static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ctr_idx = -1; + int i, pmc_idx; + int min, max; + + if (kvm_pmu_is_fw_event(eidx)) { + /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */ + min = kvpmu->num_hw_ctrs; + max = min + kvpmu->num_fw_ctrs; + } else { + /* First 3 counters are reserved for fixed counters */ + min = 3; + max = kvpmu->num_hw_ctrs; + } + + for_each_set_bit(i, &cmask, BITS_PER_LONG) { + pmc_idx = i + cbase; + if ((pmc_idx >= min && pmc_idx < max) && + !test_bit(pmc_idx, kvpmu->pmc_in_use)) { + ctr_idx = pmc_idx; + break; + } + } + + return ctr_idx; +} + +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ret; + + /* Fixed counters need to have a fixed mapping as they have different widths */ + ret = kvm_pmu_get_fixed_pmc_index(eidx); + if (ret >= 0) + return ret; + + return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask); +} + +static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + unsigned long *out_val) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u64 enabled, running; + int fevent_code; + + pmc =
&kvpmu->pmc[cidx]; + + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + } else if (pmc->perf_event) { + pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running); + } else { + return -EINVAL; + } + *out_val = pmc->counter_val; + + return 0; +} + +static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base, + unsigned long ctr_mask) +{ + /* Make sure that we have a valid counter mask requested from the caller */ + if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu))) + return -EINVAL; + + return 0; +} + +static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr, + unsigned long flags, unsigned long eidx, unsigned long evtdata) +{ + struct perf_event *event; + + kvm_pmu_release_perf_event(pmc); + attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata); + if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) { + //TODO: Do we really want to clear the value in the hardware counter? + pmc->counter_val = 0; + } + + /* + * Set the default sample_period for now. The guest-specified value + * will be updated in the start call. + */ + attr->sample_period = kvm_pmu_get_sample_period(pmc); + + event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc); + if (IS_ERR(event)) { + pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); + return PTR_ERR(event); + } + + pmc->perf_event = event; + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + perf_event_enable(pmc->perf_event); + + return 0; +} + +int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_fw_event *fevent; + + if (!kvpmu || fid >= SBI_PMU_FW_MAX) + return -EINVAL; + + fevent = &kvpmu->fw_event[fid]; + if (fevent->started) + fevent->value++; + + return 0; +} + +int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC; + + if (!kvpmu || !kvpmu->init_done) { + /* + * In the absence of sscofpmf in the platform, the guest OS may use + * the legacy PMU driver to read cycle/instret. In that case, + * just return 0 to avoid any illegal trap. However, any other + * hpmcounter access should result in an illegal trap, as they must + * be accessed through the SBI PMU only. + */ + if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) { + *val = 0; + return ret; + } else { + return KVM_INSN_ILLEGAL_TRAP; + } + } + + /* The counter CSRs are read-only.
Thus, any write should result in illegal traps */ + if (wr_mask) + return KVM_INSN_ILLEGAL_TRAP; + + cidx = csr_num - CSR_CYCLE; + + if (pmu_ctr_read(vcpu, cidx, val) < 0) + return KVM_INSN_ILLEGAL_TRAP; + + return ret; +} + +int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + retdata->out_val = kvm_pmu_num_counters(kvpmu); + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + + retdata->out_val = kvpmu->pmc[cidx].cinfo.value; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, u64 ival, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + struct kvm_pmc *pmc; + int fevent_code; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Start the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + pmc = &kvpmu->pmc[pmc_index]; + if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) + pmc->counter_val = ival; + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Check if the counter was already started for some reason */ + if (kvpmu->fw_event[fevent_code].started) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + + kvpmu->fw_event[fevent_code].started = true; + kvpmu->fw_event[fevent_code].value = pmc->counter_val; + } else if (pmc->perf_event) { + if (unlikely(pmc->started)) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc)); + perf_event_enable(pmc->perf_event); + pmc->started = true; + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + } + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + u64 enabled, running; + struct kvm_pmc *pmc; + int fevent_code; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Stop the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + pmc = &kvpmu->pmc[pmc_index]; + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (!kvpmu->fw_event[fevent_code].started) + sbiret = SBI_ERR_ALREADY_STOPPED; + + kvpmu->fw_event[fevent_code].started = false; + } else if (pmc->perf_event) { + if (pmc->started) { + /* Stop counting the counter */ + perf_event_disable(pmc->perf_event); + pmc->started = false; + } else { + sbiret = 
SBI_ERR_ALREADY_STOPPED; + } + + if (flags & SBI_PMU_STOP_FLAG_RESET) { + /* Release the counter if this is a reset request */ + pmc->counter_val += perf_event_read_value(pmc->perf_event, + &enabled, &running); + kvm_pmu_release_perf_event(pmc); + } + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + if (flags & SBI_PMU_STOP_FLAG_RESET) { + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + clear_bit(pmc_index, kvpmu->pmc_in_use); + } + } + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + unsigned long eidx, u64 evtdata, + struct kvm_vcpu_sbi_return *retdata) +{ + int ctr_idx, ret, sbiret = 0; + bool is_fevent; + unsigned long event_code; + u32 etype = kvm_pmu_get_perf_event_type(eidx); + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + struct perf_event_attr attr = { + .type = etype, + .size = sizeof(struct perf_event_attr), + .pinned = true, + /* + * It should never reach here if the platform doesn't support the sscofpmf + * extension as mode filtering won't work without it. + */ + .exclude_host = true, + .exclude_hv = true, + .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH), + .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH), + .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS, + }; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + event_code = get_event_code(eidx); + is_fevent = kvm_pmu_is_fw_event(eidx); + if (is_fevent && event_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + + /* + * SKIP_MATCH flag indicates the caller is aware of the assigned counter + * for this event. Just do a sanity check to ensure it is already marked as used. + */ + if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) { + if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) { + sbiret = SBI_ERR_FAILURE; + goto out; + } + ctr_idx = ctr_base + __ffs(ctr_mask); + } else { + ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask); + if (ctr_idx < 0) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + } + + pmc = &kvpmu->pmc[ctr_idx]; + pmc->idx = ctr_idx; + + if (is_fevent) { + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + kvpmu->fw_event[event_code].started = true; + } else { + ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata); + if (ret) + return ret; + } + + set_bit(ctr_idx, kvpmu->pmc_in_use); + pmc->event_idx = eidx; + retdata->out_val = ctr_idx; +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + + ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val); + if (ret == -EINVAL) + retdata->err_val = SBI_ERR_INVALID_PARAM; + + return 0; +} + +void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) +{ + int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* + * PMU functionality should only be available to guests if privilege mode + * filtering is available in the host. Otherwise, the guest will always count + * events while execution is in hypervisor mode. + */ + if (!riscv_isa_extension_available(NULL, SSCOFPMF)) + return; + + ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs); + if (ret < 0 || !hpm_width || !num_hw_ctrs) + return; + + /* + * Increase the number of hardware counters to offset the time counter.
+ */ + kvpmu->num_hw_ctrs = num_hw_ctrs + 1; + kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX; + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + + if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) { + pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA"); + kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS; + } + + /* + * There is no correlation between the logical hardware counters and the virtual counters. + * However, we need to encode a hpmcounter CSR in the counter info field so that + * KVM can trap and emulate the read. This works well in the migration use case as + * KVM doesn't care if the actual hpmcounter is available in the hardware or not. + */ + for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) { + /* TIME CSR shouldn't be read from the perf interface */ + if (i == 1) + continue; + pmc = &kvpmu->pmc[i]; + pmc->idx = i; + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + if (i < kvpmu->num_hw_ctrs) { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW; + if (i < 3) + /* CY, IR counters */ + pmc->cinfo.width = 63; + else + pmc->cinfo.width = hpm_width; + /* + * The CSR number doesn't have any relation to the logical + * hardware counters. The CSR numbers are encoded sequentially + * to avoid maintaining a map between the virtual counter + * and CSR number. + */ + pmc->cinfo.csr = CSR_CYCLE + i; + } else { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW; + pmc->cinfo.width = BITS_PER_LONG - 1; + } + } + + kvpmu->init_done = true; +} + +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + if (!kvpmu) + return; + + for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) { + pmc = &kvpmu->pmc[i]; + pmc->counter_val = 0; + kvm_pmu_release_perf_event(pmc); + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + } + bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS); + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); +} + +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) +{ + kvm_riscv_vcpu_pmu_deinit(vcpu); +} diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index f96991d230bf..15fde15f9fb8 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -12,26 +12,6 @@ #include <asm/sbi.h> #include <asm/kvm_vcpu_sbi.h> -static int kvm_linux_err_map_sbi(int err) -{ - switch (err) { - case 0: - return SBI_SUCCESS; - case -EPERM: - return SBI_ERR_DENIED; - case -EINVAL: - return SBI_ERR_INVALID_PARAM; - case -EFAULT: - return SBI_ERR_INVALID_ADDRESS; - case -EOPNOTSUPP: - return SBI_ERR_NOT_SUPPORTED; - case -EALREADY: - return SBI_ERR_ALREADY_AVAILABLE; - default: - return SBI_ERR_FAILURE; - }; -} - #ifndef CONFIG_RISCV_SBI_V01 static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { .extid_start = -1UL, @@ -40,6 +20,16 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { }; #endif +#ifdef CONFIG_RISCV_PMU_SBI +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu; +#else +static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { + .extid_start = -1UL, + .extid_end = -1UL, + .handler = NULL, +}; +#endif + static const struct kvm_vcpu_sbi_extension *sbi_ext[] = { &vcpu_sbi_ext_v01, &vcpu_sbi_ext_base, @@ -48,6 +38,7 @@ static const struct kvm_vcpu_sbi_extension *sbi_ext[] = { &vcpu_sbi_ext_rfence, &vcpu_sbi_ext_srst, &vcpu_sbi_ext_hsm, + &vcpu_sbi_ext_pmu, &vcpu_sbi_ext_experimental, &vcpu_sbi_ext_vendor, }; @@ -125,11 +116,14 @@ int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret = 1;
bool next_sepc = true; - bool userspace_exit = false; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; const struct kvm_vcpu_sbi_extension *sbi_ext; - struct kvm_cpu_trap utrap = { 0 }; - unsigned long out_val = 0; + struct kvm_cpu_trap utrap = {0}; + struct kvm_vcpu_sbi_return sbi_ret = { + .out_val = 0, + .err_val = 0, + .utrap = &utrap, + }; bool ext_is_v01 = false; sbi_ext = kvm_vcpu_sbi_find_ext(cp->a7); @@ -139,42 +133,46 @@ int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run) cp->a7 <= SBI_EXT_0_1_SHUTDOWN) ext_is_v01 = true; #endif - ret = sbi_ext->handler(vcpu, run, &out_val, &utrap, &userspace_exit); + ret = sbi_ext->handler(vcpu, run, &sbi_ret); } else { /* Return error for unsupported SBI calls */ cp->a0 = SBI_ERR_NOT_SUPPORTED; goto ecall_done; } + /* + * When the SBI extension returns a Linux error code, it exits the ioctl + * loop and forwards the error to userspace. + */ + if (ret < 0) { + next_sepc = false; + goto ecall_done; + } + /* Handle special error cases i.e. trap, exit, or userspace forward */ - if (utrap.scause) { + if (sbi_ret.utrap->scause) { /* No need to increment sepc or exit ioctl loop */ ret = 1; - utrap.sepc = cp->sepc; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + sbi_ret.utrap->sepc = cp->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, sbi_ret.utrap); next_sepc = false; goto ecall_done; } /* Exit the ioctl loop or propagate the error code to the guest */ - if (userspace_exit) { + if (sbi_ret.uexit) { next_sepc = false; ret = 0; } else { - /** - * SBI extension handler always returns an Linux error code. Convert - * it to the SBI specific error code that can be propagated the SBI - * caller. - */ - ret = kvm_linux_err_map_sbi(ret); - cp->a0 = ret; + cp->a0 = sbi_ret.err_val; ret = 1; } ecall_done: if (next_sepc) cp->sepc += 4; - if (!ext_is_v01) - cp->a1 = out_val; + /* a1 should only be updated when we continue the ioctl loop */ + if (!ext_is_v01 && ret == 1) + cp->a1 = sbi_ret.out_val; return ret; } diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 5d65c634d301..9945aff34c14 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -14,11 +14,11 @@ #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *trap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + const struct kvm_vcpu_sbi_extension *sbi_ext; + unsigned long *out_val = &retdata->out_val; switch (cp->a6) { case SBI_EXT_BASE_GET_SPEC_VERSION: @@ -42,9 +42,12 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * forward it to the userspace */ kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; - } else - *out_val = kvm_vcpu_sbi_find_ext(cp->a0) ? 1 : 0; + retdata->uexit = true; + } else { + sbi_ext = kvm_vcpu_sbi_find_ext(cp->a0); + *out_val = sbi_ext && sbi_ext->probe ?
+ sbi_ext->probe(vcpu) : !!sbi_ext; + } break; case SBI_EXT_BASE_GET_MVENDORID: *out_val = vcpu->arch.mvendorid; @@ -56,11 +59,11 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, *out_val = vcpu->arch.mimpid; break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; break; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { @@ -70,17 +73,15 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { }; static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu, - struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) + struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) { /* * Both SBI experimental and vendor extensions are * unconditionally forwarded to userspace. */ kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; + retdata->uexit = true; return 0; } diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 2e915cafd551..7dca0e9381d9 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -21,9 +21,9 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) - return -EINVAL; + return SBI_ERR_INVALID_PARAM; if (!target_vcpu->arch.power_off) - return -EALREADY; + return SBI_ERR_ALREADY_AVAILABLE; reset_cntx = &target_vcpu->arch.guest_reset_context; /* start address */ @@ -42,7 +42,7 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu) { if (vcpu->arch.power_off) - return -EINVAL; + return SBI_ERR_FAILURE; kvm_riscv_vcpu_power_off(vcpu); @@ -57,7 +57,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) - return -EINVAL; + return SBI_ERR_INVALID_PARAM; if (!target_vcpu->arch.power_off) return SBI_HSM_STATE_STARTED; else if (vcpu->stat.generic.blocking) @@ -67,9 +67,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) } static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) + struct kvm_vcpu_sbi_return *retdata) { int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -88,27 +86,29 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, case SBI_EXT_HSM_HART_STATUS: ret = kvm_sbi_hsm_vcpu_get_status(vcpu); if (ret >= 0) { - *out_val = ret; - ret = 0; + retdata->out_val = ret; + retdata->err_val = 0; } - break; + return 0; case SBI_EXT_HSM_HART_SUSPEND: switch (cp->a0) { case SBI_HSM_SUSPEND_RET_DEFAULT: kvm_riscv_vcpu_wfi(vcpu); break; case SBI_HSM_SUSPEND_NON_RET_DEFAULT: - ret = -EOPNOTSUPP; + ret = SBI_ERR_NOT_SUPPORTED; break; default: - ret = -EINVAL; + ret = SBI_ERR_INVALID_PARAM; } break; default: - ret = -EOPNOTSUPP; + ret = SBI_ERR_NOT_SUPPORTED; } - return ret; + retdata->err_val = ret; + + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm = { diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c new file mode 100644 index 000000000000..7eca72df2cbd --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_pmu.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <asm/csr.h> +#include <asm/sbi.h> 
+#include <asm/kvm_vcpu_sbi.h> + +static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret = 0; + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + unsigned long funcid = cp->a6; + u64 temp; + + if (!kvpmu->init_done) { + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + return 0; + } + + switch (funcid) { + case SBI_EXT_PMU_NUM_COUNTERS: + ret = kvm_riscv_vcpu_pmu_num_ctrs(vcpu, retdata); + break; + case SBI_EXT_PMU_COUNTER_GET_INFO: + ret = kvm_riscv_vcpu_pmu_ctr_info(vcpu, cp->a0, retdata); + break; + case SBI_EXT_PMU_COUNTER_CFG_MATCH: +#if defined(CONFIG_32BIT) + temp = ((uint64_t)cp->a5 << 32) | cp->a4; +#else + temp = cp->a4; +#endif + /* + * This can fail if perf core framework fails to create an event. + * Forward the error to userspace because it's an error which + * happened within the host kernel. The other option would be + * to convert to an SBI error and forward to the guest. + */ + ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1, + cp->a2, cp->a3, temp, retdata); + break; + case SBI_EXT_PMU_COUNTER_START: +#if defined(CONFIG_32BIT) + temp = ((uint64_t)cp->a4 << 32) | cp->a3; +#else + temp = cp->a3; +#endif + ret = kvm_riscv_vcpu_pmu_ctr_start(vcpu, cp->a0, cp->a1, cp->a2, + temp, retdata); + break; + case SBI_EXT_PMU_COUNTER_STOP: + ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata); + break; + case SBI_EXT_PMU_COUNTER_FW_READ: + ret = kvm_riscv_vcpu_pmu_ctr_read(vcpu, cp->a0, retdata); + break; + default: + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + } + + return ret; +} + +static unsigned long kvm_sbi_ext_pmu_probe(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + return kvpmu->init_done; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { + .extid_start = SBI_EXT_PMU, + .extid_end = SBI_EXT_PMU, + .handler = kvm_sbi_ext_pmu_handler, + .probe = kvm_sbi_ext_pmu_probe, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 03a0198389f0..7c4d5d38a339 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -11,19 +11,21 @@ #include <linux/kvm_host.h> #include <asm/sbi.h> #include <asm/kvm_vcpu_timer.h> +#include <asm/kvm_vcpu_pmu.h> #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; u64 next_cycle; - if (cp->a6 != SBI_EXT_TIME_SET_TIMER) - return -EINVAL; + if (cp->a6 != SBI_EXT_TIME_SET_TIMER) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_SET_TIMER); #if __riscv_xlen == 32 next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0; #else @@ -31,7 +33,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, #endif kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle); - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = { @@ -41,8 +43,7 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = { }; static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { int ret = 0; unsigned long i; @@ -51,9 +52,12 @@ static int kvm_sbi_ext_ipi_handler(struct 
kvm_vcpu *vcpu, struct kvm_run *run, unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; - if (cp->a6 != SBI_EXT_IPI_SEND_IPI) - return -EINVAL; + if (cp->a6 != SBI_EXT_IPI_SEND_IPI) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_IPI_SENT); kvm_for_each_vcpu(i, tmp, vcpu->kvm) { if (hbase != -1UL) { if (tmp->vcpu_id < hbase) @@ -64,6 +68,7 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT); if (ret < 0) break; + kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD); } return ret; @@ -76,10 +81,8 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi = { }; static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { - int ret = 0; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; @@ -88,6 +91,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: kvm_riscv_fence_i(vcpu->kvm, hbase, hmask); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: if (cp->a2 == 0 && cp->a3 == 0) @@ -95,6 +99,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, cp->a2, cp->a3, PAGE_SHIFT); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: if (cp->a2 == 0 && cp->a3 == 0) @@ -105,6 +110,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run hbase, hmask, cp->a2, cp->a3, PAGE_SHIFT, cp->a4); + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: @@ -116,10 +122,10 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run */ break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = { @@ -130,14 +136,12 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = { static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, bool *exit) + struct kvm_vcpu_sbi_return *retdata) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long funcid = cp->a6; u32 reason = cp->a1; u32 type = cp->a0; - int ret = 0; switch (funcid) { case SBI_EXT_SRST_RESET: @@ -146,24 +150,24 @@ static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu, kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_SHUTDOWN, reason); - *exit = true; + retdata->uexit = true; break; case SBI_SRST_RESET_TYPE_COLD_REBOOT: case SBI_SRST_RESET_TYPE_WARM_REBOOT: kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_RESET, reason); - *exit = true; + retdata->uexit = true; break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } break; default: - ret = -EOPNOTSUPP; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; } - return ret; + return 0; } const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = { diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 489f225ee66d..8f4c4fa16227 100644 --- 
a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -14,9 +14,7 @@ #include <asm/kvm_vcpu_sbi.h> static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long *out_val, - struct kvm_cpu_trap *utrap, - bool *exit) + struct kvm_vcpu_sbi_return *retdata) { ulong hmask; int i, ret = 0; @@ -24,6 +22,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_vcpu *rvcpu; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + struct kvm_cpu_trap *utrap = retdata->utrap; switch (cp->a7) { case SBI_EXT_0_1_CONSOLE_GETCHAR: @@ -33,7 +32,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * handled in kernel so we forward these to user-space */ kvm_riscv_vcpu_sbi_forward(vcpu, run); - *exit = true; + retdata->uexit = true; break; case SBI_EXT_0_1_SET_TIMER: #if __riscv_xlen == 32 @@ -48,8 +47,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, break; case SBI_EXT_0_1_SEND_IPI: if (cp->a0) - hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, - utrap); + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap); else hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; if (utrap->scause) @@ -65,14 +63,13 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, case SBI_EXT_0_1_SHUTDOWN: kvm_riscv_vcpu_sbi_system_reset(vcpu, run, KVM_SYSTEM_EVENT_SHUTDOWN, 0); - *exit = true; + retdata->uexit = true; break; case SBI_EXT_0_1_REMOTE_FENCE_I: case SBI_EXT_0_1_REMOTE_SFENCE_VMA: case SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID: if (cp->a0) - hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, - utrap); + hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap); else hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1; if (utrap->scause) @@ -103,7 +100,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, } break; default: - ret = -EINVAL; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; break; } diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 6cd93995fb65..5246da1c9167 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -17,10 +17,10 @@ static unsigned long vmid_version = 1; static unsigned long vmid_next; -static unsigned long vmid_bits; +static unsigned long vmid_bits __ro_after_init; static DEFINE_SPINLOCK(vmid_lock); -void kvm_riscv_gstage_vmid_detect(void) +void __init kvm_riscv_gstage_vmid_detect(void) { unsigned long old; diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 25d5c9664e57..6c74b0bedd60 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -3,6 +3,9 @@ lib-y += delay.o lib-y += memcpy.o lib-y += memset.o lib-y += memmove.o +lib-y += strcmp.o +lib-y += strlen.o +lib-y += strncmp.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S new file mode 100644 index 000000000000..986ab23fe787 --- /dev/null +++ b/arch/riscv/lib/strcmp.S @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> + +/* int strcmp(const char *cs, const char *ct) */ +SYM_FUNC_START(strcmp) + + ALTERNATIVE("nop", "j strcmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB) + + /* + * Returns + * a0 - comparison result, value like strcmp + * + * Parameters + * a0 - string1 + * a1 - string2 
+ * + * Clobbers + * t0, t1 + */ +1: + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 2f + bnez t0, 1b + li a0, 0 + ret +2: + /* + * strcmp only needs to return (< 0, 0, > 0) values + * not necessarily -1, 0, +1 + */ + sub a0, t0, t1 + ret + +/* + * Variant of strcmp using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strcmp_zbb: + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - comparison result, value like strcmp + * + * Parameters + * a0 - string1 + * a1 - string2 + * + * Clobbers + * t0, t1, t2, t3, t4, t5 + */ + + or t2, a0, a1 + li t4, -1 + and t2, t2, SZREG-1 + bnez t2, 3f + + /* Main loop for aligned string. */ + .p2align 3 +1: + REG_L t0, 0(a0) + REG_L t1, 0(a1) + orc.b t3, t0 + bne t3, t4, 2f + addi a0, a0, SZREG + addi a1, a1, SZREG + beq t0, t1, 1b + + /* + * Words don't match, and no null byte in the first + * word. Get bytes in big-endian order and compare. + */ +#ifndef CONFIG_CPU_BIG_ENDIAN + rev8 t0, t0 + rev8 t1, t1 +#endif + + /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ + sltu a0, t0, t1 + neg a0, a0 + ori a0, a0, 1 + ret + +2: + /* + * Found a null byte. + * If words don't match, fall back to simple loop. + */ + bne t0, t1, 3f + + /* Otherwise, strings are equal. */ + li a0, 0 + ret + + /* Simple loop for misaligned strings. */ + .p2align 3 +3: + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 4f + bnez t0, 3b + +4: + sub a0, t0, t1 + ret + +.option pop +#endif +SYM_FUNC_END(strcmp) diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S new file mode 100644 index 000000000000..8345ceeee3f6 --- /dev/null +++ b/arch/riscv/lib/strlen.S @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> + +/* int strlen(const char *s) */ +SYM_FUNC_START(strlen) + + ALTERNATIVE("nop", "j strlen_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB) + + /* + * Returns + * a0 - string length + * + * Parameters + * a0 - String to measure + * + * Clobbers: + * t0, t1 + */ + mv t1, a0 +1: + lbu t0, 0(t1) + beqz t0, 2f + addi t1, t1, 1 + j 1b +2: + sub a0, t1, a0 + ret + +/* + * Variant of strlen using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strlen_zbb: + +#ifdef CONFIG_CPU_BIG_ENDIAN +# define CZ clz +# define SHIFT sll +#else +# define CZ ctz +# define SHIFT srl +#endif + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - string length + * + * Parameters + * a0 - String to measure + * + * Clobbers + * t0, t1, t2, t3 + */ + + /* Number of irrelevant bytes in the first word. */ + andi t2, a0, SZREG-1 + + /* Align pointer. */ + andi t0, a0, -SZREG + + li t3, SZREG + sub t3, t3, t2 + slli t2, t2, 3 + + /* Get the first word. */ + REG_L t1, 0(t0) + + /* + * Shift away the partial data we loaded to remove the irrelevant bytes + * preceding the string with the effect of adding NUL bytes at the + * end of the string's first word. + */ + SHIFT t1, t1, t2 + + /* Convert non-NUL into 0xff and NUL into 0x00. */ + orc.b t1, t1 + + /* Convert non-NUL into 0x00 and NUL into 0xff. */ + not t1, t1 + + /* + * Search for the first set bit (corresponding to a NUL byte in the + * original chunk). + */ + CZ t1, t1 + + /* + * The first chunk is special: compare against the number + * of valid bytes in this chunk. + */ + srli a0, t1, 3 + bgtu t3, a0, 3f + + /* Prepare for the word comparison loop. 
*/ + addi t2, t0, SZREG + li t3, -1 + + /* + * Our critical loop is 4 instructions and processes data in + * 4 byte or 8 byte chunks. + */ + .p2align 3 +1: + REG_L t1, SZREG(t0) + addi t0, t0, SZREG + orc.b t1, t1 + beq t1, t3, 1b +2: + not t1, t1 + CZ t1, t1 + + /* Get number of processed words. */ + sub t2, t0, t2 + + /* Add number of characters in the first word. */ + add a0, a0, t2 + srli t1, t1, 3 + + /* Add number of characters in the last word. */ + add a0, a0, t1 +3: + ret + +.option pop +#endif +SYM_FUNC_END(strlen) diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S new file mode 100644 index 000000000000..ee49595075be --- /dev/null +++ b/arch/riscv/lib/strncmp.S @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm-generic/export.h> +#include <asm/alternative-macros.h> +#include <asm/errata_list.h> + +/* int strncmp(const char *cs, const char *ct, size_t count) */ +SYM_FUNC_START(strncmp) + + ALTERNATIVE("nop", "j strncmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB) + + /* + * Returns + * a0 - comparison result, value like strncmp + * + * Parameters + * a0 - string1 + * a1 - string2 + * a2 - number of characters to compare + * + * Clobbers + * t0, t1, t2 + */ + li t2, 0 +1: + beq a2, t2, 2f + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 3f + addi t2, t2, 1 + bnez t0, 1b +2: + li a0, 0 + ret +3: + /* + * strncmp only needs to return (< 0, 0, > 0) values + * not necessarily -1, 0, +1 + */ + sub a0, t0, t1 + ret + +/* + * Variant of strncmp using the ZBB extension if available + */ +#ifdef CONFIG_RISCV_ISA_ZBB +strncmp_zbb: + +.option push +.option arch,+zbb + + /* + * Returns + * a0 - comparison result, like strncmp + * + * Parameters + * a0 - string1 + * a1 - string2 + * a2 - number of characters to compare + * + * Clobbers + * t0, t1, t2, t3, t4, t5, t6 + */ + + or t2, a0, a1 + li t5, -1 + and t2, t2, SZREG-1 + add t4, a0, a2 + bnez t2, 4f + + /* Adjust limit for fast-path. */ + andi t6, t4, -SZREG + + /* Main loop for aligned string. */ + .p2align 3 +1: + bgt a0, t6, 3f + REG_L t0, 0(a0) + REG_L t1, 0(a1) + orc.b t3, t0 + bne t3, t5, 2f + addi a0, a0, SZREG + addi a1, a1, SZREG + beq t0, t1, 1b + + /* + * Words don't match, and no null byte in the first + * word. Get bytes in big-endian order and compare. + */ +#ifndef CONFIG_CPU_BIG_ENDIAN + rev8 t0, t0 + rev8 t1, t1 +#endif + + /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ + sltu a0, t0, t1 + neg a0, a0 + ori a0, a0, 1 + ret + +2: + /* + * Found a null byte. + * If words don't match, fall back to simple loop. + */ + bne t0, t1, 3f + + /* Otherwise, strings are equal. */ + li a0, 0 + ret + + /* Simple loop for misaligned strings. */ +3: + /* Restore limit for slow-path. 
*/ + .p2align 3 +4: + bge a0, t4, 6f + lbu t0, 0(a0) + lbu t1, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne t0, t1, 5f + bnez t0, 4b + +5: + sub a0, t0, t1 + ret + +6: + li a0, 0 + ret + +.option pop +#endif +SYM_FUNC_END(strncmp) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index d86f7cebd4a7..eb0774d9c03b 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -267,10 +267,12 @@ asmlinkage void do_page_fault(struct pt_regs *regs) if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (!user_mode(regs) && addr < TASK_SIZE && - unlikely(!(regs->status & SR_SUM))) - die_kernel_fault("access to user memory without uaccess routines", - addr, regs); + if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) { + if (fixup_exception(regs)) + return; + + die_kernel_fault("access to user memory without uaccess routines", addr, regs); + } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index dd58e1d99397..d16bf715a586 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -2,6 +2,7 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o +purgatory-y += strcmp.o strlen.o strncmp.o targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) @@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE $(call if_changed_rule,as_o_S) +$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE + $(call if_changed_rule,as_o_S) + +$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE + $(call if_changed_rule,as_o_S) + +$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE + $(call if_changed_rule,as_o_S) + $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) @@ -77,6 +87,9 @@ CFLAGS_ctype.o += $(PURGATORY_CFLAGS) AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2 AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2 AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2 +AFLAGS_REMOVE_strcmp.o += -Wa,-gdwarf-2 +AFLAGS_REMOVE_strlen.o += -Wa,-gdwarf-2 +AFLAGS_REMOVE_strncmp.o += -Wa,-gdwarf-2 $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 933771b0b07a..078cd1a773a3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -714,7 +714,9 @@ config EADM_SCH config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" - depends on S390_CCW_IOMMU && VFIO_MDEV + depends on S390_CCW_IOMMU + depends on VFIO + select VFIO_MDEV help This driver allows usage of I/O subchannels via VFIO-CCW. @@ -724,8 +726,10 @@ config VFIO_CCW config VFIO_AP def_tristate n prompt "VFIO support for AP devices" - depends on S390_AP_IOMMU && VFIO_MDEV && KVM + depends on S390_AP_IOMMU && KVM + depends on VFIO depends on ZCRYPT + select VFIO_MDEV help This driver grants access to Adjunct Processor (AP) devices via the VFIO mediated device interface. 
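The three Zbb string routines added above (strcmp, strlen, strncmp) all lean on the same word-at-a-time trick: orc.b sets each byte of the destination to 0xff if the corresponding source byte is non-zero and to 0x00 if it is zero, so one compare against -1 screens a whole register for a NUL byte. A rough C model of that scan step may help when reading the assembly; this is an illustrative sketch, not part of the patch, and the helper name plus the use of the GCC/Clang __builtin_ctzl builtin are my own choices:

#include <stddef.h>

/* Illustrative model of the Zbb NUL-byte scan used by the strlen/strcmp/
 * strncmp fast paths above; the helper name is made up. */
static size_t bytes_before_nul(unsigned long word)
{
	unsigned long orc = 0;
	size_t i;

	/* Model of "orc.b orc, word": 0xff per non-zero byte, 0x00 per NUL. */
	for (i = 0; i < sizeof(word); i++)
		if ((word >> (8 * i)) & 0xff)
			orc |= 0xffUL << (8 * i);

	/* No NUL in this word: the asm compares against -1 and keeps looping. */
	if (orc == -1UL)
		return sizeof(word);

	/* Model of "not" + "ctz": the first set bit of ~orc marks the first
	 * NUL byte (little-endian), and bits/8 converts that to a byte count. */
	return (size_t)__builtin_ctzl(~orc) / 8;
}

In the assembly the same effect costs three instructions per word (orc.b, not, ctz or clz), which is what makes the Zbb variants worthwhile on aligned strings.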
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 74b35ec2ad28..3c68fe49042c 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -594,7 +594,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index cec71268e3bc..9ab91632f74c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -583,7 +583,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d67ce719d16a..2bbc3d54959d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1031,7 +1031,6 @@ extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h new file mode 100644 index 000000000000..399343ed9ffb --- /dev/null +++ b/arch/s390/include/asm/msi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MSI_H +#define _ASM_S390_MSI_H +#include <asm-generic/msi.h> + +/* + * Work around S390 not using irq_domain at all so we can't set + * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See the following for an explanation of how it works: + * + * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/ + * + * Note this is less isolated than the ARM/x86 versions as userspace can trigger + * MSIs belonging to kernel devices within the same gisa.
+ */ +#define arch_is_isolated_msi() true + +#endif diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 91e63426bdc5..7119c04c51c5 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -186,9 +186,10 @@ static inline unsigned long *get_st_pto(unsigned long entry) /* Prototypes */ void dma_free_seg_table(unsigned long); -unsigned long *dma_alloc_cpu_table(void); +unsigned long *dma_alloc_cpu_table(gfp_t gfp); void dma_cleanup_tables(unsigned long *); -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr); +unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, + gfp_t gfp); void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags); extern const struct dma_map_ops s390_pci_dma_ops; diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 9e2b95a222a9..34f9542636e9 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -22,10 +22,10 @@ KBUILD_AFLAGS += -DBUILD_VDSO KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 -s +KBUILD_AFLAGS_64 += -m64 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ --hash-style=both --build-id=sha1 -T diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 0243b6e38d36..3eb85f254881 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1162,6 +1162,115 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } /** + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. + * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. 
+ */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** * guest_translate_address_with_key - translate guest logical into guest absolute address * @vcpu: virtual cpu * @gva: Guest virtual address diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 9408d6cc8e2c..b320d12aa049 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + /** * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab26aa53ee37..9250fde1f97d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3103,9 +3103,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) static void process_gib_alert_list(void) { struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; struct kvm_s390_gisa *gisa; struct kvm *kvm; - u32 final, origin = 0UL; do { /* @@ -3131,9 +3131,10 @@ static void process_gib_alert_list(void) * interruptions asap. 
*/ while (origin & GISA_ADDR_MASK) { - gisa = (struct kvm_s390_gisa *)(u64)origin; + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); origin = gisa->next_alert; - gisa->next_alert = (u32)(u64)gisa; + gisa->next_alert = gisa_phys; kvm = container_of(gisa, struct sie_page2, gisa)->kvm; gi = &kvm->arch.gisa_int; if (hrtimer_active(&gi->timer)) @@ -3415,8 +3416,9 @@ void kvm_s390_gib_destroy(void) gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { + u32 gib_origin; int rc = 0; if (!css_general_characteristics.aiv) { @@ -3438,7 +3440,8 @@ int kvm_s390_gib_init(u8 nisc) } gib->nisc = nisc; - if (chsc_sgib((u32)(u64)gib)) { + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e4890e04b210..39b36562c043 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -256,17 +256,6 @@ debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -329,25 +318,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void *opaque) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -385,7 +355,7 @@ static __always_inline void __insn32_query(unsigned int opcode, u8 *query) #define INSN_SORTL 0xb938 #define INSN_DFLTCC 0xb939 -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -488,7 +458,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -498,11 +468,11 @@ int kvm_arch_init(void *opaque) kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); if (!kvm_s390_dbf_uv) - goto out; + goto err_kvm_uv; if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) - goto out; + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -510,30 +480,49 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; } if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { rc = kvm_s390_pci_init(); if (rc) { pr_err("Unable to allocate AIFT for PCI\n"); - goto out; + goto err_pci; } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = 
kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) kvm_s390_pci_exit(); @@ -584,7 +573,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: - case KVM_CAP_S390_MEM_OP_EXTENSION: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -598,6 +586,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. + */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: @@ -2764,41 +2761,33 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) return r; } -static bool access_key_invalid(u8 access_key) -{ - return access_key > 0xf; -} - -static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) { - void __user *uaddr = (void __user *)mop->buf; - u64 supported_flags; - void *tmpbuf = NULL; - int r, srcu_idx; - - supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION - | KVM_S390_MEMOP_F_CHECK_ONLY; if (mop->flags & ~supported_flags || !mop->size) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - /* - * This is technically a heuristic only, if the kvm->lock is not - * taken, it is not guaranteed that the vm is/remains non-protected. - * This is ok from a kernel perspective, wrongdoing is detected - * on the access, -EFAULT is returned and the vm may crash the - * next time it accesses the memory in question. - * There is no sane usecase to do switching and a memop on two - * different CPUs at the same time. 
- */ - if (kvm_s390_pv_get_handle(kvm)) - return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { - if (access_key_invalid(mop->key)) + if (mop->key > 0xf) return -EINVAL; } else { mop->key = 0; } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) @@ -2812,35 +2801,25 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) goto out_unlock; } - switch (mop->op) { - case KVM_S390_MEMOP_ABSOLUTE_READ: { - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key); - } else { - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - } - break; + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; } - case KVM_S390_MEMOP_ABSOLUTE_WRITE: { - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key); - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - break; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; } - break; - } - default: - r = -EINVAL; + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); } out_unlock: @@ -2850,6 +2829,75 @@ out_unlock: return r; } +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. 
Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that. + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane use case to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5249,62 +5297,54 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; void *tmpbuf = NULL; - int r = 0; - const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY - | KVM_S390_MEMOP_F_SKEY_PROTECTION; + int r; - if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) return -EINVAL; - if (mop->size > MEM_OP_MAX_SIZE) - return -E2BIG; if (kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; - if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { - if (access_key_invalid(mop->key)) - return -EINVAL; - } else { - mop->key = 0; - } if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } - switch (mop->op) { - case KVM_S390_MEMOP_LOGICAL_READ: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, - GACC_FETCH, mop->key); - break; - } + acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ?
GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - break; - case KVM_S390_MEMOP_LOGICAL_WRITE: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, - GACC_STORE, mop->key); - break; + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; } + } else { if (copy_from_user(tmpbuf, uaddr, mop->size)) { r = -EFAULT; - break; + goto out_free; } r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - break; } +out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); +out_free: vfree(tmpbuf); return r; } @@ -5633,23 +5673,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; - if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY) - return 0; + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ - /* A few sanity checks. We can have memory slots which have to be - located/ended at a segment boundary (1MB). The memory in userland is - ok to be fragmented into various different vmas. It is okay to mmap() - and munmap() stuff in this slot after doing this call at any time */ + if (new->userspace_addr & 0xffffful) + return -EINVAL; - if (new->userspace_addr & 0xffffful) - return -EINVAL; + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; - size = new->npages * PAGE_SIZE; - if (size & 0xffffful) - return -EINVAL; + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } - if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) - return -EINVAL; + if (!kvm->arch.migration_mode) + return 0; + + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging to be enabled to store + * its dirty bitmap.
+ */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); return 0; } @@ -5696,7 +5753,7 @@ static inline unsigned long nonhyp_mask(int i) static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -5712,12 +5769,23 @@ static int __init kvm_s390_init(void) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = __kvm_s390_init(); + if (r) + return r; + + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index d48588c207d8..0261d42c7d01 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -470,7 +470,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); void kvm_s390_gisa_disable(struct kvm *kvm); void kvm_s390_gisa_enable(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index ec51e810e381..b124d586db55 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -672,7 +672,7 @@ out: return r; } -int kvm_s390_pci_init(void) +int __init kvm_s390_pci_init(void) { zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h index 486d06ef563f..ff0972dd5e71 100644 --- a/arch/s390/kvm/pci.h +++ b/arch/s390/kvm/pci.h @@ -60,7 +60,7 @@ void kvm_s390_pci_clear_list(struct kvm *kvm); int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); -int kvm_s390_pci_init(void); +int __init kvm_s390_pci_init(void); void kvm_s390_pci_exit(void); static inline bool kvm_s390_pci_interp_allowed(void) diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index ea478d11fbd1..2d9b01d7ca4c 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -27,11 +27,11 @@ static int zpci_refresh_global(struct zpci_dev *zdev) zdev->iommu_pages * PAGE_SIZE); } -unsigned long *dma_alloc_cpu_table(void) +unsigned long *dma_alloc_cpu_table(gfp_t gfp) { unsigned long *table, *entry; - table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC); + table = kmem_cache_alloc(dma_region_table_cache, gfp); if (!table) return NULL; @@ -45,11 +45,11 @@ static void dma_free_cpu_table(void *table) kmem_cache_free(dma_region_table_cache, table); } -static unsigned long *dma_alloc_page_table(void) +static unsigned long *dma_alloc_page_table(gfp_t gfp) { unsigned long *table, *entry; - table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC); + table = kmem_cache_alloc(dma_page_table_cache, gfp); if (!table) return NULL; @@ -63,7 +63,7 @@ static void dma_free_page_table(void *table) kmem_cache_free(dma_page_table_cache, table); } -static unsigned long *dma_get_seg_table_origin(unsigned long *rtep) +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp) { unsigned long old_rte, rte; unsigned long *sto; @@ -72,7 +72,7 @@ static unsigned long *dma_get_seg_table_origin(unsigned long *rtep) if (reg_entry_isvalid(rte)) { sto = get_rt_sto(rte); } else { - sto = dma_alloc_cpu_table(); + sto = 
dma_alloc_cpu_table(gfp); if (!sto) return NULL; @@ -90,7 +90,7 @@ static unsigned long *dma_get_seg_table_origin(unsigned long *rtep) return sto; } -static unsigned long *dma_get_page_table_origin(unsigned long *step) +static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp) { unsigned long old_ste, ste; unsigned long *pto; @@ -99,7 +99,7 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step) if (reg_entry_isvalid(ste)) { pto = get_st_pto(ste); } else { - pto = dma_alloc_page_table(); + pto = dma_alloc_page_table(gfp); if (!pto) return NULL; set_st_pto(&ste, virt_to_phys(pto)); @@ -116,18 +116,19 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step) return pto; } -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) +unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, + gfp_t gfp) { unsigned long *sto, *pto; unsigned int rtx, sx, px; rtx = calc_rtx(dma_addr); - sto = dma_get_seg_table_origin(&rto[rtx]); + sto = dma_get_seg_table_origin(&rto[rtx], gfp); if (!sto) return NULL; sx = calc_sx(dma_addr); - pto = dma_get_page_table_origin(&sto[sx]); + pto = dma_get_page_table_origin(&sto[sx], gfp); if (!pto) return NULL; @@ -170,7 +171,8 @@ static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, return -EINVAL; for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, + GFP_ATOMIC); if (!entry) { rc = -ENOMEM; goto undo_cpu_trans; @@ -186,7 +188,8 @@ undo_cpu_trans: while (i-- > 0) { page_addr -= PAGE_SIZE; dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, + GFP_ATOMIC); if (!entry) break; dma_update_cpu_trans(entry, page_addr, flags); @@ -576,7 +579,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) spin_lock_init(&zdev->iommu_bitmap_lock); - zdev->dma_table = dma_alloc_cpu_table(); + zdev->dma_table = dma_alloc_cpu_table(GFP_KERNEL); if (!zdev->dma_table) { rc = -ENOMEM; goto out; diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index d237bc6841cb..32573b4f9bd2 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -24,7 +24,7 @@ KCSAN_SANITIZE := n KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index 591125c42d49..b5e29f99c02c 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -8,13 +8,6 @@ OBJECTS := head_32.o misc.o cache.o piggy.o \ ashiftrt.o ashldi3.o ashrsi3.o ashlsi3.o lshrsi3.o -# These were previously generated files. When you are building the kernel -# with O=, make sure to remove the stale files in the output tree. Otherwise, -# the build system wrongly compiles the stale ones. 
-ifdef building_out_of_srctree -$(shell rm -f $(addprefix $(obj)/, ashiftrt.S ashldi3.c ashrsi3.S ashlsi3.S lshrsi3.S)) -endif - targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 \ vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo $(OBJECTS) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index ad4ff3b0e91e..541a9b18e343 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -25,9 +25,12 @@ config UML select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select HAVE_GCC_PLUGINS + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c select HAVE_ARCH_VMAP_STACK + select HAVE_RUST if X86_64 config MMU bool @@ -242,4 +245,8 @@ source "arch/um/drivers/Kconfig" config ARCH_SUSPEND_POSSIBLE def_bool y +menu "Power management options" + source "kernel/power/Kconfig" + +endmenu diff --git a/arch/um/Makefile b/arch/um/Makefile index f1d4d67157be..8186d4761bda 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -68,6 +68,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ -Din6addr_loopback=kernel_in6addr_loopback \ -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr +KBUILD_RUSTFLAGS += -Crelocation-model=pie + KBUILD_AFLAGS += $(ARCH_INCLUDE) USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ @@ -139,11 +141,10 @@ ifeq ($(CONFIG_LD_IS_BFD),y) LDFLAGS_EXECSTACK += $(call ld-option,--no-warn-rwx-segments) endif -LD_FLAGS_CMDLINE = $(foreach opt,$(KBUILD_LDFLAGS),-Wl,$(opt)) +LD_FLAGS_CMDLINE = $(foreach opt,$(KBUILD_LDFLAGS) $(LDFLAGS_EXECSTACK),-Wl,$(opt)) # Used by link-vmlinux.sh which has special support for um link -export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) -export LDFLAGS_vmlinux := $(LDFLAGS_EXECSTACK) +export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) $(CC_FLAGS_LTO) # When cleaning we don't include .config, so we don't include # TT or skas makefiles and don't clean skas_ptregs.h. 
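The pci_dma.c hunks above all make the same change: the allocation context (gfp_t) is decided once at the entry points and threaded down through dma_walk_cpu_trans() to the leaf table allocators, instead of hard-coding GFP_ATOMIC everywhere. Below is a minimal, self-contained sketch of that pattern; the types and names are illustrative stand-ins, not the kernel's.

	#include <stdlib.h>

	typedef unsigned int gfp_t;
	#define GFP_KERNEL 0x1	/* caller may sleep: device setup paths */
	#define GFP_ATOMIC 0x2	/* caller may not sleep: mapping fast paths */

	#define ENTRIES 512

	/* Leaf allocator takes the context instead of assuming GFP_ATOMIC. */
	static unsigned long *table_alloc(gfp_t gfp)
	{
		(void)gfp;	/* stand-in for kmem_cache_alloc(cache, gfp) */
		return calloc(ENTRIES, sizeof(unsigned long));
	}

	/* Mid-level walk only forwards the caller's choice. */
	static unsigned long *table_walk(unsigned long *root, unsigned int idx,
					 gfp_t gfp)
	{
		if (!root[idx]) {
			unsigned long *next = table_alloc(gfp);

			if (!next)
				return NULL;
			root[idx] = (unsigned long)next;
		}
		return (unsigned long *)root[idx];
	}

	/*
	 * Usage mirrors the diff: the init path can pass GFP_KERNEL, while
	 * the atomic mapping path keeps GFP_ATOMIC.
	 */

The design payoff is that only the outermost callers know whether sleeping is allowed, so the table-walk helpers never over-constrain the allocator.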
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index a4f0a19fbe14..36911b1fddcf 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -261,6 +261,7 @@ config UML_NET_VECTOR config UML_NET_VDE bool "VDE transport (obsolete)" depends on UML_NET + depends on !MODVERSIONS select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network transport allows one or more running @@ -309,6 +310,7 @@ config UML_NET_MCAST config UML_NET_PCAP bool "pcap transport (obsolete)" depends on UML_NET + depends on !MODVERSIONS select MAY_HAVE_RUNTIME_DEPS help The pcap transport makes a pcap packet stream on the host look diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index e1dc4292bd22..dee6f66353b3 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -72,4 +72,4 @@ CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c index cfe4cb17694c..25ee2c97ca21 100644 --- a/arch/um/drivers/pcap_kern.c +++ b/arch/um/drivers/pcap_kern.c @@ -15,7 +15,7 @@ struct pcap_init { char *filter; }; -void pcap_init(struct net_device *dev, void *data) +void pcap_init_kern(struct net_device *dev, void *data) { struct uml_net_private *pri; struct pcap_data *ppri; @@ -44,7 +44,7 @@ static int pcap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) } static const struct net_kern_info pcap_kern_info = { - .init = pcap_init, + .init = pcap_init_kern, .protocol = eth_protocol, .read = pcap_read, .write = pcap_write, diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index ded7c47d2fbe..131b7cb29576 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -767,6 +767,7 @@ static int vector_config(char *str, char **error_out) if (parsed == NULL) { *error_out = "vector_config failed to parse parameters"; + kfree(params); return -EINVAL; } diff --git a/arch/um/drivers/vector_user.h b/arch/um/drivers/vector_user.h index 3a73d17a0161..59ed5f9e6e41 100644 --- a/arch/um/drivers/vector_user.h +++ b/arch/um/drivers/vector_user.h @@ -68,8 +68,6 @@ struct vector_fds { }; #define VECTOR_READ 1 -#define VECTOR_WRITE (1 < 1) -#define VECTOR_HEADERS (1 < 2) extern struct arglist *uml_parse_vector_ifspec(char *arg); diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 3ac220dafec4..7699ca5f35d4 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -8,6 +8,7 @@ #include <linux/virtio.h> #include <linux/virtio_config.h> #include <linux/logic_iomem.h> +#include <linux/of_platform.h> #include <linux/irqdomain.h> #include <linux/virtio_pcidev.h> #include <linux/virtio-uml.h> @@ -39,6 +40,8 @@ struct um_pci_device { unsigned long status; int irq; + + bool platform; }; struct um_pci_device_reg { @@ -48,13 +51,15 @@ struct um_pci_device_reg { static struct pci_host_bridge *bridge; static DEFINE_MUTEX(um_pci_mtx); +static struct um_pci_device *um_pci_platform_device; static struct um_pci_device_reg um_pci_devices[MAX_DEVICES]; static struct fwnode_handle *um_pci_fwnode; static struct irq_domain *um_pci_inner_domain; static struct irq_domain *um_pci_msi_domain; static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; -#define UM_VIRT_PCI_MAXDELAY 40000 +static unsigned int um_pci_max_delay_us = 40000; +module_param_named(max_delay_us, 
um_pci_max_delay_us, uint, 0644); struct um_pci_message_buffer { struct virtio_pcidev_msg hdr; @@ -132,8 +137,11 @@ static int um_pci_send_cmd(struct um_pci_device *dev, out ? 1 : 0, posted ? cmd : HANDLE_NO_FREE(cmd), GFP_ATOMIC); - if (ret) + if (ret) { + if (posted) + kfree(cmd); goto out; + } if (posted) { virtqueue_kick(dev->cmd_vq); @@ -155,7 +163,7 @@ static int um_pci_send_cmd(struct um_pci_device *dev, kfree(completed); if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || - ++delay_count > UM_VIRT_PCI_MAXDELAY, + ++delay_count > um_pci_max_delay_us, "um virt-pci delay: %d", delay_count)) { ret = -EIO; break; @@ -480,6 +488,9 @@ static void um_pci_handle_irq_message(struct virtqueue *vq, struct virtio_device *vdev = vq->vdev; struct um_pci_device *dev = vdev->priv; + if (!dev->irq) + return; + /* we should properly chain interrupts, but on ARCH=um we don't care */ switch (msg->op) { @@ -533,6 +544,25 @@ static void um_pci_irq_vq_cb(struct virtqueue *vq) } } +/* Copied from arch/x86/kernel/devicetree.c */ +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} + static int um_pci_init_vqs(struct um_pci_device *dev) { struct virtqueue *vqs[2]; @@ -561,6 +591,55 @@ static int um_pci_init_vqs(struct um_pci_device *dev) return 0; } +static void __um_pci_virtio_platform_remove(struct virtio_device *vdev, + struct um_pci_device *dev) +{ + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + mutex_lock(&um_pci_mtx); + um_pci_platform_device = NULL; + mutex_unlock(&um_pci_mtx); + + kfree(dev); +} + +static int um_pci_virtio_platform_probe(struct virtio_device *vdev, + struct um_pci_device *dev) +{ + int ret; + + dev->platform = true; + + mutex_lock(&um_pci_mtx); + + if (um_pci_platform_device) { + mutex_unlock(&um_pci_mtx); + ret = -EBUSY; + goto out_free; + } + + ret = um_pci_init_vqs(dev); + if (ret) { + mutex_unlock(&um_pci_mtx); + goto out_free; + } + + um_pci_platform_device = dev; + + mutex_unlock(&um_pci_mtx); + + ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); + if (ret) + __um_pci_virtio_platform_remove(vdev, dev); + + return ret; + +out_free: + kfree(dev); + return ret; +} + static int um_pci_virtio_probe(struct virtio_device *vdev) { struct um_pci_device *dev; @@ -574,6 +653,9 @@ static int um_pci_virtio_probe(struct virtio_device *vdev) dev->vdev = vdev; vdev->priv = dev; + if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) + return um_pci_virtio_platform_probe(vdev, dev); + mutex_lock(&um_pci_mtx); for (i = 0; i < MAX_DEVICES; i++) { if (um_pci_devices[i].dev) @@ -623,9 +705,11 @@ static void um_pci_virtio_remove(struct virtio_device *vdev) struct um_pci_device *dev = vdev->priv; int i; - /* Stop all virtqueues */ - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); + if (dev->platform) { + of_platform_depopulate(&vdev->dev); + __um_pci_virtio_platform_remove(vdev, dev); + return; + } device_set_wakeup_enable(&vdev->dev, false); @@ -633,12 +717,27 @@ static void um_pci_virtio_remove(struct virtio_device *vdev) for (i = 0; i < MAX_DEVICES; i++) { if (um_pci_devices[i].dev != dev) continue; + um_pci_devices[i].dev = NULL; irq_free_desc(dev->irq); + + break; } mutex_unlock(&um_pci_mtx); - um_pci_rescan(); + if (i < MAX_DEVICES) { + struct 
pci_dev *pci_dev; + + pci_dev = pci_get_slot(bridge->bus, i); + if (pci_dev) + pci_stop_and_remove_bus_device_locked(pci_dev); + } + + /* Stop all virtqueues */ + virtio_reset_device(vdev); + dev->cmd_vq = NULL; + dev->irq_vq = NULL; + vdev->config->del_vqs(vdev); kfree(dev); } @@ -860,6 +959,30 @@ void *pci_root_bus_fwnode(struct pci_bus *bus) return um_pci_fwnode; } +static long um_pci_map_platform(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + if (!um_pci_platform_device) + return -ENOENT; + + *ops = &um_pci_device_bar_ops; + *priv = &um_pci_platform_device->resptr[0]; + + return 0; +} + +static const struct logic_iomem_region_ops um_pci_platform_ops = { + .map = um_pci_map_platform, +}; + +static struct resource virt_platform_resource = { + .name = "platform", + .start = 0x10000000, + .end = 0x1fffffff, + .flags = IORESOURCE_MEM, +}; + static int __init um_pci_init(void) { int err, i; @@ -868,6 +991,8 @@ static int __init um_pci_init(void) &um_pci_cfgspace_ops)); WARN_ON(logic_iomem_add_region(&virt_iomem_resource, &um_pci_iomem_ops)); + WARN_ON(logic_iomem_add_region(&virt_platform_resource, + &um_pci_platform_ops)); if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, "No virtio device ID configured for PCI - no PCI support\n")) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 588930a0ced1..8adca2000e51 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -168,7 +168,8 @@ static void vhost_user_check_reset(struct virtio_uml_device *vu_dev, if (!vu_dev->registered) return; - virtio_break_device(&vu_dev->vdev); + vu_dev->registered = 0; + schedule_work(&pdata->conn_broken_wk); } @@ -412,7 +413,7 @@ static irqreturn_t vu_req_read_message(struct virtio_uml_device *vu_dev, if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY) vhost_user_reply(vu_dev, &msg.msg, response); irq_rc = IRQ_HANDLED; - }; + } /* mask EAGAIN as we try non-blocking read until socket is empty */ vu_dev->recv_rc = (rc == -EAGAIN) ? 0 : rc; return irq_rc; @@ -1136,6 +1137,15 @@ void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev, static void vu_of_conn_broken(struct work_struct *wk) { + struct virtio_uml_platform_data *pdata; + struct virtio_uml_device *vu_dev; + + pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk); + + vu_dev = platform_get_drvdata(pdata->pdev); + + virtio_break_device(&vu_dev->vdev); + /* * We can't remove the device from the devicetree so the only thing we * can do is warn. 
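Both connection-broken paths above defer virtio_break_device() to the workqueue callback, which recovers the device state by walking from the embedded work item back to its enclosing platform data with the usual container_of() idiom. A standalone, userspace-compilable sketch of that idiom follows; the types are toys, not the driver's own.

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work_struct { int pending; };

	/* Toy stand-in for struct virtio_uml_platform_data. */
	struct pdata {
		int id;
		struct work_struct conn_broken_wk;	/* embedded work item */
	};

	static void conn_broken(struct work_struct *wk)
	{
		/* Recover the enclosing structure from the embedded member. */
		struct pdata *pdata = container_of(wk, struct pdata,
						   conn_broken_wk);

		printf("would break virtio device of pdata %d here\n", pdata->id);
	}

	int main(void)
	{
		struct pdata p = { .id = 42 };

		conn_broken(&p.conn_broken_wk);	/* the workqueue would call this */
		return 0;
	}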
@@ -1266,8 +1276,14 @@ static int vu_unregister_cmdline_device(struct device *dev, void *data) static void vu_conn_broken(struct work_struct *wk) { struct virtio_uml_platform_data *pdata; + struct virtio_uml_device *vu_dev; pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk); + + vu_dev = platform_get_drvdata(pdata->pdev); + + virtio_break_device(&vu_dev->vdev); + vu_unregister_cmdline_device(&pdata->pdev->dev, NULL); } diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index bb5f06480da9..7414154b8e9a 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -91,7 +91,7 @@ struct cpuinfo_um { extern struct cpuinfo_um boot_cpu_data; -#define cpu_data (&boot_cpu_data) +#define cpu_data(cpu) boot_cpu_data #define current_cpu_data boot_cpu_data #define cache_line_size() (boot_cpu_data.cache_alignment) diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 1c2d4b29a3d4..811188be954c 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o USER_OBJS := config.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules targets := config.c config.tmp capflags.c diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 58938d75871a..827a0d3fa589 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -29,8 +29,8 @@ void flush_thread(void) ret = unmap(¤t->mm->context.id, 0, TASK_SIZE, 1, &data); if (ret) { - printk(KERN_ERR "flush_thread - clearing address space failed, " - "err = %d\n", ret); + printk(KERN_ERR "%s - clearing address space failed, err = %d\n", + __func__, ret); force_sig(SIGKILL); } get_safe_registers(current_pt_regs()->regs.gp, diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f3d494a4fd9b..f93972a25765 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -14,4 +14,4 @@ UNPROFILE_OBJS := clone.o KCOV_INSTRUMENT := n -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index ad449173a1a1..7d050ab0f78a 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -314,8 +314,8 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, return ret; } -void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) +static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, + unsigned long end_addr, int force) { pgd_t *pgd; struct host_vm_change hvc; @@ -597,6 +597,8 @@ void force_flush_all(void) struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); + mmap_read_lock(mm); for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 1); + mmap_read_unlock(mm); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 786b44dc20c9..8dcda617b8bf 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -96,7 +96,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < nr_cpu_ids ? cpu_data + *pos : NULL; + return *pos < nr_cpu_ids ? 
&boot_cpu_data + *pos : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S index 16e49bfa2b42..53d719c04ba9 100644 --- a/arch/um/kernel/vmlinux.lds.S +++ b/arch/um/kernel/vmlinux.lds.S @@ -1,4 +1,4 @@ - +#define RUNTIME_DISCARD_EXIT KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER); #ifdef CONFIG_LD_SCRIPT_STATIC diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 77ac50baa3f8..544e0b344c75 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -18,4 +18,4 @@ USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ tty.o umid.o util.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile index d79e75f1b69a..cf2d75bb1884 100644 --- a/arch/um/os-Linux/drivers/Makefile +++ b/arch/um/os-Linux/drivers/Makefile @@ -10,4 +10,4 @@ obj-y = obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index 98ea910ef87c..cf7e49c08b21 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -127,12 +127,10 @@ int os_mod_epoll_fd(int events, int fd, void *data) int os_del_epoll_fd(int fd) { struct epoll_event event; - int result; /* This is quiet as we use this as IO ON/OFF - so it is often * invoked on a non-existent fd */ - result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event); - return result; + return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event); } void os_set_ioignore(void) diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile index c4566e788815..75f11989d2e9 100644 --- a/arch/um/os-Linux/skas/Makefile +++ b/arch/um/os-Linux/skas/Makefile @@ -7,4 +7,4 @@ obj-y := mem.o process.o USER_OBJS := $(obj-y) -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 3b4975ee67e2..953fb10f3f93 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -60,8 +60,8 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) printk(UM_KERN_ERR "Registers - \n"); for (i = 0; i < MAX_REG_NR; i++) printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n", - -n); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); } err = ptrace(PTRACE_CONT, pid, 0, 0); @@ -81,20 +81,17 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) offset = *((unsigned long *) mm_idp->stack + 1); if (offset) { data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA); - printk(UM_KERN_ERR "do_syscall_stub : ret = %ld, offset = %ld, " - "data = %p\n", ret, offset, data); + printk(UM_KERN_ERR "%s : ret = %ld, offset = %ld, data = %p\n", + __func__, ret, offset, data); syscall = (unsigned long *)((unsigned long)data + data[0]); - printk(UM_KERN_ERR "do_syscall_stub: syscall %ld failed, " - "return value = 0x%lx, expected return value = 0x%lx\n", - syscall[0], ret, syscall[7]); - printk(UM_KERN_ERR " syscall parameters: " - "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + printk(UM_KERN_ERR "%s: syscall %ld failed, return value = 0x%lx, expected return value 
= 0x%lx\n", + __func__, syscall[0], ret, syscall[7]); + printk(UM_KERN_ERR " syscall parameters: 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", syscall[1], syscall[2], syscall[3], syscall[4], syscall[5], syscall[6]); for (n = 1; n < data[0]/sizeof(long); n++) { if (n == 1) - printk(UM_KERN_ERR " additional syscall " - "data:"); + printk(UM_KERN_ERR " additional syscall data:"); if (n % 4 == 1) printk("\n" UM_KERN_ERR " "); printk(" 0x%lx", data[n]); diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index b24db6017ded..b1ea53285af1 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -118,8 +118,8 @@ void wait_stub_done(int pid) err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) { - printk(UM_KERN_ERR "wait_stub_done : continue failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } } @@ -130,11 +130,10 @@ void wait_stub_done(int pid) bad_wait: err = ptrace_dump_regs(pid); if (err) - printk(UM_KERN_ERR "Failed to get registers from stub, " - "errno = %d\n", -err); - printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, " - "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno, - status); + printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", + -err); + printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", + __func__, pid, n, errno, status); fatal_sigsegv(); } @@ -195,15 +194,15 @@ static void handle_trap(int pid, struct uml_pt_regs *regs, err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); if (err < 0) { - printk(UM_KERN_ERR "handle_trap - nullifying syscall " - "failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s - nullifying syscall failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } err = ptrace(PTRACE_SYSCALL, pid, 0, 0); if (err < 0) { - printk(UM_KERN_ERR "handle_trap - continuing to end of " - "syscall failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s - continuing to end of syscall failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } @@ -212,11 +211,10 @@ static void handle_trap(int pid, struct uml_pt_regs *regs, (WSTOPSIG(status) != SIGTRAP + 0x80)) { err = ptrace_dump_regs(pid); if (err) - printk(UM_KERN_ERR "Failed to get registers " - "from process, errno = %d\n", -err); - printk(UM_KERN_ERR "handle_trap - failed to wait at " - "end of syscall, errno = %d, status = %d\n", - errno, status); + printk(UM_KERN_ERR "Failed to get registers from process, errno = %d\n", + -err); + printk(UM_KERN_ERR "%s - failed to wait at end of syscall, errno = %d, status = %d\n", + __func__, errno, status); fatal_sigsegv(); } } @@ -256,8 +254,8 @@ static int userspace_tramp(void *stack) addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, " - "errno = %d\n", STUB_CODE, errno); + printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, errno = %d\n", + STUB_CODE, errno); exit(1); } @@ -267,8 +265,7 @@ static int userspace_tramp(void *stack) UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, fd, offset); if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping segfault stack " - "at 0x%lx failed, errno = %d\n", + printk(UM_KERN_ERR "mapping segfault stack at 0x%lx failed, errno = %d\n", STUB_DATA, errno); exit(1); } @@ -286,8 +283,8 @@ static int userspace_tramp(void *stack) sa.sa_sigaction = (void *) v; 
sa.sa_restorer = NULL; if (sigaction(SIGSEGV, &sa, NULL) < 0) { - printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV " - "handler failed - errno = %d\n", errno); + printk(UM_KERN_ERR "%s - setting SIGSEGV handler failed - errno = %d\n", + __func__, errno); exit(1); } } @@ -322,8 +319,8 @@ int start_userspace(unsigned long stub_stack) MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (stack == MAP_FAILED) { err = -errno; - printk(UM_KERN_ERR "start_userspace : mmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", + __func__, errno); return err; } @@ -336,8 +333,8 @@ int start_userspace(unsigned long stub_stack) pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); if (pid < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : clone failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", + __func__, errno); return err; } @@ -345,31 +342,31 @@ int start_userspace(unsigned long stub_stack) CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); if (n < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : wait failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); goto out_kill; } } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { err = -EINVAL; - printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got " - "status = %d\n", status); + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); goto out_kill; } if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, (void *) PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, errno = %d\n", + __func__, errno); goto out_kill; } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : munmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", + __func__, errno); goto out_kill; } @@ -403,14 +400,14 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) * just kill the process. 
*/ if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "userspace - ptrace set regs " - "failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } if (put_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "userspace - ptrace set fp regs " - "failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } @@ -421,28 +418,28 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) singlestepping(NULL)); if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "userspace - ptrace continue " - "failed, op = %d, errno = %d\n", op, errno); + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); fatal_sigsegv(); } CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); if (err < 0) { - printk(UM_KERN_ERR "userspace - wait failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } regs->is_user = 1; if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "userspace - get_fp_registers failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } @@ -494,8 +491,8 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) unblock_signals_trace(); break; default: - printk(UM_KERN_ERR "userspace - child stopped " - "with signal %d\n", sig); + printk(UM_KERN_ERR "%s - child stopped with signal %d\n", + __func__, sig); fatal_sigsegv(); } pid = userspace_pid[0]; @@ -555,15 +552,15 @@ int copy_context_skas0(unsigned long new_stack, int pid) err = ptrace_setregs(pid, thread_regs); if (err < 0) { err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS " - "failed, pid = %d, errno = %d\n", pid, -err); + printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n", + __func__, pid, -err); return err; } err = put_fp_registers(pid, thread_fp_regs); if (err < 0) { - printk(UM_KERN_ERR "copy_context_skas0 : put_fp_registers " - "failed, pid = %d, err = %d\n", pid, err); + printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n", + __func__, pid, err); return err; } @@ -574,8 +571,8 @@ int copy_context_skas0(unsigned long new_stack, int pid) err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) { err = -errno; - printk(UM_KERN_ERR "Failed to continue new process, pid = %d, " - "errno = %d\n", pid, errno); + printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n", + pid, errno); return err; } @@ -583,8 +580,8 @@ int copy_context_skas0(unsigned long new_stack, int pid) pid = data->parent_err; if (pid < 0) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports " - "error %d\n", -pid); + printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", + __func__, -pid); return pid; } @@ -594,8 +591,8 @@ int copy_context_skas0(unsigned long new_stack, int pid) */ wait_stub_done(pid); if (child_data->child_err != STUB_DATA) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-child %d reports " - "error %ld\n", pid, data->child_err); + printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n", + __func__, pid, data->child_err); err = 
data->child_err; goto out_kill; } @@ -603,8 +600,8 @@ int copy_context_skas0(unsigned long new_stack, int pid) if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, (void *)PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, errno = %d\n", + __func__, errno); goto out_kill; } @@ -672,8 +669,8 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) kmalloc_ok = 0; return 1; default: - printk(UM_KERN_ERR "Bad sigsetjmp return in " - "start_idle_thread - %d\n", n); + printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", + __func__, n); fatal_sigsegv(); } longjmp(*switch_buf, 1); diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index b3c1ae084180..b70559b821df 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -1,6 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 core-y += arch/x86/crypto/ +# +# Disable SSE and other FP/SIMD instructions to match normal x86 +# +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 + ifeq ($(CONFIG_X86_32),y) START := 0x8048000 @@ -17,7 +23,7 @@ LDS_EXTRA := -Ui386 export LDS_EXTRA # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. -include arch/x86/Makefile_32.cpu +include $(srctree)/arch/x86/Makefile_32.cpu # prevent gcc from keeping the stack 16 byte aligned. Taken from i386. cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1acff356d97a..6b6cfe607bdb 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. -KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h # sev.c indirectly inludes inat-table.h which is generated during diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index ad0d51f03cb4..6a255e6809bc 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -13,6 +13,12 @@ /* * Bitmasks of exposed registers (with VMM). */ +#define TDX_RDX BIT(2) +#define TDX_RBX BIT(3) +#define TDX_RSI BIT(6) +#define TDX_RDI BIT(7) +#define TDX_R8 BIT(8) +#define TDX_R9 BIT(9) #define TDX_R10 BIT(10) #define TDX_R11 BIT(11) #define TDX_R12 BIT(12) @@ -27,9 +33,9 @@ * details can be found in TDX GHCI specification, section * titled "TDCALL [TDG.VP.VMCALL] leaf". 
*/ -#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \ - TDX_R12 | TDX_R13 | \ - TDX_R14 | TDX_R15 ) +#define TDVMCALL_EXPOSE_REGS_MASK \ + ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ + TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 ) .section .noinstr.text, "ax" @@ -126,25 +132,38 @@ SYM_FUNC_START(__tdx_hypercall) push %r14 push %r13 push %r12 + push %rbx + + /* Free RDI and RSI to be used as TDVMCALL arguments */ + movq %rdi, %rax + push %rsi + + /* Copy hypercall registers from arg struct: */ + movq TDX_HYPERCALL_r8(%rax), %r8 + movq TDX_HYPERCALL_r9(%rax), %r9 + movq TDX_HYPERCALL_r10(%rax), %r10 + movq TDX_HYPERCALL_r11(%rax), %r11 + movq TDX_HYPERCALL_r12(%rax), %r12 + movq TDX_HYPERCALL_r13(%rax), %r13 + movq TDX_HYPERCALL_r14(%rax), %r14 + movq TDX_HYPERCALL_r15(%rax), %r15 + movq TDX_HYPERCALL_rdi(%rax), %rdi + movq TDX_HYPERCALL_rsi(%rax), %rsi + movq TDX_HYPERCALL_rbx(%rax), %rbx + movq TDX_HYPERCALL_rdx(%rax), %rdx + + push %rax /* Mangle function call ABI into TDCALL ABI: */ /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ xor %eax, %eax - /* Copy hypercall registers from arg struct: */ - movq TDX_HYPERCALL_r10(%rdi), %r10 - movq TDX_HYPERCALL_r11(%rdi), %r11 - movq TDX_HYPERCALL_r12(%rdi), %r12 - movq TDX_HYPERCALL_r13(%rdi), %r13 - movq TDX_HYPERCALL_r14(%rdi), %r14 - movq TDX_HYPERCALL_r15(%rdi), %r15 - movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx tdcall /* - * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that + * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that * something has gone horribly wrong with the TDX module. * * The return status of the hypercall operation is in a separate @@ -154,30 +173,46 @@ SYM_FUNC_START(__tdx_hypercall) testq %rax, %rax jne .Lpanic - /* TDVMCALL leaf return code is in R10 */ - movq %r10, %rax + pop %rax /* Copy hypercall result registers to arg struct if needed */ - testq $TDX_HCALL_HAS_OUTPUT, %rsi + testq $TDX_HCALL_HAS_OUTPUT, (%rsp) jz .Lout - movq %r10, TDX_HYPERCALL_r10(%rdi) - movq %r11, TDX_HYPERCALL_r11(%rdi) - movq %r12, TDX_HYPERCALL_r12(%rdi) - movq %r13, TDX_HYPERCALL_r13(%rdi) - movq %r14, TDX_HYPERCALL_r14(%rdi) - movq %r15, TDX_HYPERCALL_r15(%rdi) + movq %r8, TDX_HYPERCALL_r8(%rax) + movq %r9, TDX_HYPERCALL_r9(%rax) + movq %r10, TDX_HYPERCALL_r10(%rax) + movq %r11, TDX_HYPERCALL_r11(%rax) + movq %r12, TDX_HYPERCALL_r12(%rax) + movq %r13, TDX_HYPERCALL_r13(%rax) + movq %r14, TDX_HYPERCALL_r14(%rax) + movq %r15, TDX_HYPERCALL_r15(%rax) + movq %rdi, TDX_HYPERCALL_rdi(%rax) + movq %rsi, TDX_HYPERCALL_rsi(%rax) + movq %rbx, TDX_HYPERCALL_rbx(%rax) + movq %rdx, TDX_HYPERCALL_rdx(%rax) .Lout: + /* TDVMCALL leaf return code is in R10 */ + movq %r10, %rax + /* * Zero out registers exposed to the VMM to avoid speculative execution * with VMM-controlled values. This needs to include all registers - * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15 - * context will be restored. + * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which + * will be restored. 
*/ + xor %r8d, %r8d + xor %r9d, %r9d xor %r10d, %r10d xor %r11d, %r11d + xor %rdi, %rdi + xor %rdx, %rdx + + /* Remove TDX_HCALL_* flags from the stack */ + pop %rsi /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + pop %rbx pop %r12 pop %r13 pop %r14 diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 3bd111d5e6a0..055300e08fb3 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -19,9 +19,14 @@ #define TDX_GET_VEINFO 3 #define TDX_GET_REPORT 4 #define TDX_ACCEPT_PAGE 6 +#define TDX_WR 8 + +/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +#define TDCS_NOTIFY_ENABLES 0x9100000000000010 /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 /* MMIO direction */ #define EPT_READ 0 @@ -37,6 +42,7 @@ #define VE_GET_PORT_NUM(e) ((e) >> 16) #define VE_IS_IO_STRING(e) ((e) & BIT(4)) +#define ATTR_DEBUG BIT(0) #define ATTR_SEPT_VE_DISABLE BIT(28) /* TDX Module call error codes */ @@ -141,6 +147,41 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) } EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); +static void __noreturn tdx_panic(const char *msg) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = TDVMCALL_REPORT_FATAL_ERROR, + .r12 = 0, /* Error code: 0 is Panic */ + }; + union { + /* Define register order according to the GHCI */ + struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; }; + + char str[64]; + } message; + + /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ + strncpy(message.str, msg, 64); + + args.r8 = message.r8; + args.r9 = message.r9; + args.r14 = message.r14; + args.r15 = message.r15; + args.rdi = message.rdi; + args.rsi = message.rsi; + args.rbx = message.rbx; + args.rdx = message.rdx; + + /* + * This hypercall should never return and it is not safe + * to keep the guest running. Call it forever if it + * happens to return. + */ + while (1) + __tdx_hypercall(&args, 0); +} + static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; @@ -172,8 +213,15 @@ static void tdx_parse_tdinfo(u64 *cc_mask) * TD-private memory. Only VMM-shared memory (MMIO) will #VE. */ td_attr = out.rdx; - if (!(td_attr & ATTR_SEPT_VE_DISABLE)) - panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n"); + if (!(td_attr & ATTR_SEPT_VE_DISABLE)) { + const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set."; + + /* Relax SEPT_VE_DISABLE check for debug TD. */ + if (td_attr & ATTR_DEBUG) + pr_warn("%s\n", msg); + else + tdx_panic(msg); + } } /* @@ -617,6 +665,11 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve) } } +static inline bool is_private_gpa(u64 gpa) +{ + return gpa == cc_mkenc(gpa); +} + /* * Handle the kernel #VE. * @@ -635,6 +688,8 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) case EXIT_REASON_CPUID: return handle_cpuid(regs, ve); case EXIT_REASON_EPT_VIOLATION: + if (is_private_gpa(ve->gpa)) + panic("Unexpected EPT-violation on private memory."); return handle_mmio(regs, ve); case EXIT_REASON_IO_INSTRUCTION: return handle_io(regs, ve); @@ -801,6 +856,9 @@ void __init tdx_early_init(void) tdx_parse_tdinfo(&cc_mask); cc_set_mask(cc_mask); + /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ + tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL); + /* * All bits above GPA width are reserved and kernel treats shared bit * as flag, not as part of physical address. 
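tdx_panic() above packs its message through a union so that bytes 0-7 of the string land in r14, bytes 8-15 in r15, and so on through rdx, matching the register order TDVMCALL_REPORT_FATAL_ERROR expects per the GHCI. Here is a userspace-compilable sketch of just the packing step; the layout mirrors the diff, while the hypercall itself is omitted.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		union {
			/* Register order according to the GHCI, as in the diff. */
			struct { uint64_t r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
			char str[64];
		} message = { .str = { 0 } };

		/*
		 * No terminating '\0' is needed inside the buffer; the VMM
		 * assumes one past the end if all 64 bytes are used.
		 */
		strncpy(message.str,
			"TD misconfiguration: SEPT_VE_DISABLE attribute must be set.",
			64);

		/* Bytes 0-7 are now in r14, bytes 8-15 in r15, ... */
		printf("r14=%016llx r15=%016llx\n",
		       (unsigned long long)message.r14,
		       (unsigned long long)message.r15);
		return 0;
	}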
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fce2d1247a7..a3fb996a86a1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6495,6 +6495,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; x86_pmu.extra_regs = intel_spr_extra_regs; x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index b0354dc869d2..a2e566e53076 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2366,8 +2366,10 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; - case 4: case 5: + x86_pmu.pebs_ept = 1; + fallthrough; + case 4: x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 389ea336258f..73c9672c123b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -315,6 +315,9 @@ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ #define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 566ac26239ba..0b73a809e9e1 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -269,6 +269,9 @@ enum hv_isolation_type { /* TSC invariant control */ #define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) + /* Register name aliases for temporary compatibility */ #define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT #define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 72184b0b2219..b241af4ce9b4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); /* NMI */ -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_KVM_INTEL) /* - * Special NOIST entry point for VMX which invokes this on the kernel - * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI - * 'executing' marker. - * - * On 32bit this just uses the regular NMI entry point because 32-bit does - * not have ISTs. + * Special entry point for VMX which invokes this on the kernel stack, even for + * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work + * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well + * to avoid more ifdeffery. 
*/
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
-#else
-#define asm_exc_nmi_noist asm_exc_nmi
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
 #endif

 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index abccd51dcfca..8dc345cc6318 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,6 +14,7 @@ BUILD_BUG_ON(1)
 * to make a definition optional, but in this case the default will
 * be __static_call_return0.
 */
+KVM_X86_OP(check_processor_compatibility)
 KVM_X86_OP(hardware_enable)
 KVM_X86_OP(hardware_disable)
 KVM_X86_OP(hardware_unsetup)
@@ -76,7 +77,6 @@ KVM_X86_OP(set_nmi_mask)
 KVM_X86_OP(enable_nmi_window)
 KVM_X86_OP(enable_irq_window)
 KVM_X86_OP_OPTIONAL(update_cr8_intercept)
-KVM_X86_OP(check_apicv_inhibit_reasons)
 KVM_X86_OP(refresh_apicv_exec_ctrl)
 KVM_X86_OP_OPTIONAL(hwapic_irr_update)
 KVM_X86_OP_OPTIONAL(hwapic_isr_update)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6aaae18f1854..808c292ad3f4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -134,8 +134,6 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)

-#define INVALID_GPA (~(gpa_t)0)
-
 /* KVM Hugepage definitions for x86 */
 #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
 #define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
@@ -514,6 +512,7 @@ struct kvm_pmc {
 #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
 #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
 #define KVM_PMC_MAX_FIXED 3
+#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
 #define KVM_AMD_PMC_MAX_GENERIC 6
 struct kvm_pmu {
 unsigned nr_arch_gp_counters;
@@ -678,6 +677,11 @@ struct kvm_vcpu_hv {
 } nested;
 };

+struct kvm_hypervisor_cpuid {
+ u32 base;
+ u32 limit;
+};
+
 /* Xen HVM per vcpu emulation context */
 struct kvm_vcpu_xen {
 u64 hypercall_rip;
@@ -698,6 +702,7 @@ struct kvm_vcpu_xen {
 struct hrtimer timer;
 int poll_evtchn;
 struct timer_list poll_timer;
+ struct kvm_hypervisor_cpuid cpuid;
 };

 struct kvm_queued_exception {
@@ -826,7 +831,7 @@ struct kvm_vcpu_arch {

 int cpuid_nent;
 struct kvm_cpuid_entry2 *cpuid_entries;
- u32 kvm_cpuid_base;
+ struct kvm_hypervisor_cpuid kvm_cpuid;

 u64 reserved_gpa_bits;
 int maxphyaddr;
@@ -1022,19 +1027,30 @@ struct kvm_arch_memory_slot {
 };

 /*
- * We use as the mode the number of bits allocated in the LDR for the
- * logical processor ID. It happens that these are all powers of two.
- * This makes it is very easy to detect cases where the APICs are
- * configured for multiple modes; in that case, we cannot use the map and
- * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APICs to be in the same mode, each addressable APIC
+ * to be mapped to only one MDA, and each MDA to map to at most one APIC.
 */
-#define KVM_APIC_MODE_XAPIC_CLUSTER 4
-#define KVM_APIC_MODE_XAPIC_FLAT 8
-#define KVM_APIC_MODE_X2APIC 16
+enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode.
*/ + KVM_APIC_MODE_XAPIC_CLUSTER, + /* All software enabled local APICs in xAPIC flat addressing mode. */ + KVM_APIC_MODE_XAPIC_FLAT, + /* All software enabled local APICs in x2APIC mode. */ + KVM_APIC_MODE_X2APIC, + /* + * Optimized map disabled, e.g. not all local APICs in the same logical + * mode, same logical ID assigned to multiple APICs, etc. + */ + KVM_APIC_MODE_MAP_DISABLED, +}; struct kvm_apic_map { struct rcu_head rcu; - u8 mode; + enum kvm_apic_logical_mode logical_mode; u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; @@ -1088,6 +1104,7 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; @@ -1133,6 +1150,18 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[]; +}; + enum kvm_apicv_inhibit { /********************************************************************/ @@ -1164,6 +1193,12 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_BLOCKIRQ, /* + * APICv is disabled because not all vCPUs have a 1:1 mapping between + * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, + + /* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest * from their reset values. @@ -1201,6 +1236,12 @@ enum kvm_apicv_inhibit { * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, + + /* + * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 + * mapping between logical ID and vCPU. + */ + APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, }; struct kvm_arch { @@ -1249,10 +1290,11 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; - /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ - struct rw_semaphore apicv_update_lock; - bool apic_access_memslot_enabled; + bool apic_access_memslot_inhibited; + + /* Protects apicv_inhibit_reasons */ + struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1302,7 +1344,6 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; - int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; @@ -1338,25 +1379,16 @@ struct kvm_arch { /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; - struct kvm_pmu_event_filter __rcu *pmu_event_filter; + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 - /* - * Whether the TDP MMU is enabled for this VM. This contains a - * snapshot of the TDP MMU module parameter from when the VM was - * created and remains unchanged for the life of the VM. If this is - * true, TDP MMU handler functions will run for various MMU - * operations. - */ - bool tdp_mmu_enabled; - /* The number of TDP MMU pages across all roots. */ atomic64_t tdp_mmu_pages; /* - * List of kvm_mmu_page structs being used as roots. - * All kvm_mmu_page structs in the list should have + * List of struct kvm_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. 
* * For reads, this list is protected by: @@ -1520,6 +1552,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) struct kvm_x86_ops { const char *name; + int (*check_processor_compatibility)(void); + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1608,6 +1642,8 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + const unsigned long required_apicv_inhibits; + bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(int isr); @@ -1731,9 +1767,6 @@ struct kvm_x86_nested_ops { }; struct kvm_x86_init_ops { - int (*cpu_has_kvm_support)(void); - int (*disabled_by_bios)(void); - int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); @@ -1760,6 +1793,9 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1982,7 +2018,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, @@ -2054,14 +2090,11 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_GIF_MASK (1 << 0) -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) -#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM -#define HF_SMM_MASK (1 << 6) -#define HF_SMM_INSIDE_NMI_MASK (1 << 7) +#define HF_SMM_MASK (1 << 1) +#define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 04c17be9b5fd..bc5b4d788c08 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +void cpu_emergency_disable_virtualization(void); + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_panic_self_stop(struct pt_regs *regs); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 559176887791..4a03993e0785 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -21,12 +21,18 @@ * This is a software only structure and not part of the TDX module/VMM ABI. 
*/ struct tdx_hypercall_args { + u64 r8; + u64 r9; u64 r10; u64 r11; u64 r12; u64 r13; u64 r14; u64 r15; + u64 rdi; + u64 rsi; + u64 rbx; + u64 rdx; }; /* Used to request services from the VMM */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 8757078d4442..3b12e6b99412 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM to ensure INIT and NMI + * aren't blocked, e.g. if a fatal error occurred between CLGI + * and STGI. Note, STGI may #UD if SVM is disabled from NMI + * context between reading EFER and executing STGI. In that + * case, GIF must already be set, otherwise the NMI would have + * been blocked, so just eat the fault. + */ + asm_volatile_goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); +fault: + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } } /** Makes sure SVM is disabled, if it is supported on the CPU diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf..5fc35f889cd1 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -38,9 +38,11 @@ extern struct start_info *xen_start_info; #include <asm/processor.h> +#define XEN_SIGNATURE "XenVMMXenVMM" + static inline uint32_t xen_cpuid_base(void) { - return hypervisor_cpuid_base("XenVMMXenVMM", 2); + return hypervisor_cpuid_base(XEN_SIGNATURE, 2); } struct pci_dev; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e48deab8901d..7f467fe05d42 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/ioctl.h> +#include <linux/stddef.h> #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -507,8 +508,8 @@ struct kvm_nested_state { * KVM_{GET,PUT}_NESTED_STATE ioctl values. */ union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; + __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx); + __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm); } data; }; @@ -525,6 +526,35 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) + +/* + * Masked event layout. 
+ * Bits Description + * ---- ----------- + * 7:0 event select (low bits) + * 15:8 umask match + * 31:16 unused + * 35:32 event select (high bits) + * 36:54 unused + * 55 exclude bit + * 63:56 umask mask + */ + +#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ + (((mask) & 0xFFULL) << 56) | \ + (((match) & 0xFFULL) << 8) | \ + ((__u64)(!!(exclude)) << 55)) + +#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ + (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index ef9e951415c5..283dcd2f62c8 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -76,12 +76,18 @@ static void __used common(void) OFFSET(TDX_MODULE_r11, tdx_module_output, r11); BLANK(); + OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8); + OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9); OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi); + OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi); + OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx); + OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx); BLANK(); OFFSET(BP_scratch, boot_params, scratch); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f924a76c6923..f36dc2f796c5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -460,7 +460,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 305514431f26..cdd92ab43cda 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -37,7 +37,6 @@ #include <linux/kdebug.h> #include <asm/cpu.h> #include <asm/reboot.h> -#include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> @@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Disable VMX or SVM if needed. - * - * We need to disable virtualization on all CPUs. - * Having VMX or SVM enabled on any CPU may break rebooting - * after the kdump kernel has finished its task. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); - /* * Disable Intel PT to stop its logging */ @@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Booting kdump kernel with VMX or SVM enabled won't work, - * because (among other limitations) we can't disable paging - * with the virt flags. 
- */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); + cpu_emergency_disable_virtualization(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index c315b18ec7c8..776f4b1e395b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -560,14 +560,14 @@ nmi_restart: } } -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) -DEFINE_IDTENTRY_RAW(exc_nmi_noist) +#if IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -#endif #if IS_MODULE(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); +#endif #endif #ifdef CONFIG_NMI_CHECK_CPU diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c3636ea4aa71..d03c551defcc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -528,33 +528,29 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct pt_regs *regs) -{ - cpu_emergency_vmxoff(); -} +static inline void nmi_shootdown_cpus_on_restart(void); -/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ -static void emergency_vmx_disable_all(void) +static void emergency_reboot_disable_virtualization(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* - * Disable VMX on all CPUs before rebooting, otherwise we risk hanging - * the machine, because the CPU blocks INIT when it's in VMX root. + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. * * We can't take any locks and we may be on an inconsistent state, so - * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if VMX if off on _this_ CPU, as that - * doesn't prevent a different CPU from being in VMX root operation. + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx()) { - /* Safely force _this_ CPU out of VMX root operation. */ - __cpu_emergency_vmxoff(); + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); - /* Halt and exit VMX root operation on the other CPUs. */ - nmi_shootdown_cpus(vmxoff_nmi); + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); } } @@ -590,7 +586,7 @@ static void native_machine_emergency_restart(void) unsigned short mode; if (reboot_emergency) - emergency_vmx_disable_all(); + emergency_reboot_disable_virtualization(); tboot_shutdown(TB_SHUTDOWN_REBOOT); @@ -795,6 +791,17 @@ void machine_crash_shutdown(struct pt_regs *regs) /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. 
+ */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; @@ -817,7 +824,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, regs); + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -828,18 +842,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -/* - * Halt all other CPUs, calling the specified function on each of them +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. * - * This function can be used to halt all other CPUs on crash - * or emergency reboot time. The function passed as parameter - * will be called inside a NMI handler on all CPUs. + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); @@ -867,7 +895,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) msecs--; } - /* Leave the nmi callback set */ + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. 
+ */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); } /* @@ -897,6 +935,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* No other CPUs to shoot down */ } +static inline void nmi_shootdown_cpus_on_restart(void) { } + void run_crash_ipi_callback(struct pt_regs *regs) { } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 06db901fabe8..375b33ecafa2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -32,7 +32,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> -#include <asm/virtext.h> +#include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { ack_APIC_irq(); - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbeaa9ddef59..8e578311ca9d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7f1b585f9a67..599aebec2d52 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -8,6 +8,7 @@ * Copyright 2011 Red Hat, Inc. and/or its affiliates. 
* Copyright IBM Corporation, 2008 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/export.h> @@ -25,6 +26,7 @@ #include "mmu.h" #include "trace.h" #include "pmu.h" +#include "xen.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be @@ -180,15 +182,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { - u32 function; + struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; + u32 base; - vcpu->arch.kvm_cpuid_base = 0; - - for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function); + for_each_possible_hypervisor_cpuid_base(base) { + entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; @@ -197,19 +199,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) signature[1] = entry->ecx; signature[2] = entry->edx; - BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); - if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { - vcpu->arch.kvm_cpuid_base = function; + if (!memcmp(signature, sig, sizeof(signature))) { + cpuid.base = base; + cpuid.limit = entry->eax; break; } } } + + return cpuid; } static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { - u32 base = vcpu->arch.kvm_cpuid_base; + u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; @@ -439,7 +443,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); + vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); + vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; @@ -663,8 +668,9 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) | - F(AVX_IFMA) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | + F(FZRM) | F(FSRS) | F(FSRC) | + F(AMX_FP16) | F(AVX_IFMA) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, @@ -701,6 +707,10 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, + SF(CONSTANT_TSC) + ); + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | @@ -1169,8 +1179,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ - /* invariant TSC is CPUID.80000007H:EDX[8] */ - entry->edx &= (1 << 8); + cpuid_entry_override(entry, CPUID_8000_0007_EDX); + /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; @@ -1485,6 +1495,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(F(RTM) | F(HLE)); + } else if (function == 0x80000007) { + if (kvm_hv_invtsc_suppressed(vcpu)) + *edx &= ~SF(CONSTANT_TSC); } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 
c1390357126a..ee8c4c3496ed 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -4,6 +4,8 @@ * * Copyright 2016 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56e1cf7c339e..a20bec931764 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -17,6 +17,7 @@ * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "kvm_cache_regs.h" @@ -1633,7 +1634,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_SS: /* * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL + * selector's RPL != CPL or DPL != CPL */ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) goto exception; @@ -1695,11 +1696,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* * segment is not a data or readable code segment or * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) + * and ((RPL > DPL) or (CPL > DPL))) */ if ((seg_desc.type & 0xa) == 0x8 || (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) + (rpl > dpl || cpl > dpl))) goto exception; break; } @@ -2309,7 +2310,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int em_rsm(struct x86_emulate_ctxt *ctxt) { - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) + if (!ctxt->ops->is_smm(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->leave_smm(ctxt)) @@ -5132,7 +5133,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; - unsigned emul_flags; + bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); ctxt->mem_read.pos = 0; @@ -5147,7 +5148,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5181,7 +5181,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(&ctxt->dst); } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(is_guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5210,7 +5210,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5264,7 +5264,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8296942a868..b28fd020066f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -17,6 +17,7 @@ * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 
#include "x86.h"
 #include "lapic.h"
@@ -43,6 +44,24 @@

 #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)

+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). The response of this hypercall is a 64-bit
+ * value where each bit tells which extended hypercall is available besides
+ * HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
+
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
 bool vcpu_kick);
@@ -999,6 +1018,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
 case HV_X64_MSR_TSC_EMULATION_CONTROL:
 case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
 case HV_X64_MSR_SYNDBG_OPTIONS:
 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 r = true;
@@ -1283,6 +1303,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
 case HV_X64_MSR_TSC_EMULATION_STATUS:
 return hv_vcpu->cpuid_cache.features_eax &
 HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 case HV_X64_MSR_CRASH_CTL:
 return hv_vcpu->cpuid_cache.features_edx &
@@ -1410,12 +1433,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 if (!host)
 return 1;
 break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
 case HV_X64_MSR_SYNDBG_OPTIONS:
 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 return syndbg_set_msr(vcpu, msr, data, host);
 default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 return 1;
 }
 return 0;
@@ -1536,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 return 1;
 break;
 default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 return 1;
 }

@@ -1585,11 +1617,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 case HV_X64_MSR_TSC_EMULATION_STATUS:
 data = hv->hv_tsc_emulation_status;
 break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
 case HV_X64_MSR_SYNDBG_OPTIONS:
 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 return syndbg_get_msr(vcpu, msr, pdata, host);
 default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
 return 1;
 }

@@ -1654,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 data = APIC_BUS_FREQUENCY;
 break;
 default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
 return 1;
 }
 *pdata = data;
@@ -2420,6 +2455,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
 case HVCALL_SEND_IPI:
 return hv_vcpu->cpuid_cache.enlightenments_eax &
 HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ...
HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
 default:
 break;
 }
@@ -2512,14 +2550,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
 break;
 }
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
 case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
 if (unlikely(hc.var_cnt)) {
 ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
 break;
 }
@@ -2578,15 +2609,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 ret = HV_STATUS_OPERATION_DENIED;
 break;
 }
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
 }
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
 default:
 ret = HV_STATUS_INVALID_HYPERCALL_CODE;
 break;
@@ -2594,6 +2624,15 @@

hypercall_complete:
 return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
 }

 void kvm_hv_init_vm(struct kvm *kvm)
@@ -2733,9 +2772,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
 ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
 ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;

 ent->ebx |= HV_POST_MESSAGES;
 ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;

 ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
 ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 9f96414a31c5..f83b8db72b11 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 HV_SYNIC_STIMER_COUNT);
 }

+/*
+ * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8])
+ * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * If Hyper-V's invariant TSC control is not exposed to the guest,
+ * the invariant TSC CPUID flag is not suppressed; Windows guests were
+ * observed to be able to handle it correctly. Going forward, VMMs are
+ * encouraged to enable Hyper-V's invariant TSC control when the
+ * invariant TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V.
+ */ + if (!hv_vcpu || + !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT)) + return false; + + /* + * If Hyper-V's invariant TSC control is exposed to the guest, KVM is + * responsible for suppressing the invariant TSC CPUID flag if the + * Hyper-V control is not enabled. + */ + return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC); +} + void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e0a7a0e7a73c..cd57a517d04a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -30,7 +30,7 @@ * Based on QEMU and Xen. */ -#define pr_fmt(fmt) "pit: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) if (ps->period < min_period) { pr_info_ratelimited( - "kvm: requested %lld ns " + "requested %lld ns " "i8254 timer period limited to %lld ns\n", ps->period, min_period); ps->period = min_period; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e1bb6218bb96..4756bcb5724f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,8 @@ * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/slab.h> #include <linux/bitops.h> @@ -35,7 +37,7 @@ #include "trace.h" #define pr_pic_unimpl(fmt, ...) \ - pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__) static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 765943d7cfa5..042dee556125 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -26,6 +26,7 @@ * Yaozu (Eddie) Dong <eddie.dong@intel.com> * Based on Xen 3.1 code. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a70952eca905..b2c397dd2bc6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -7,6 +7,7 @@ * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 3742d9adacfc..16d076a1b91a 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -8,6 +8,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + pr_info("apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); if (irq_source_id >= BITS_PER_LONG) { - printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + pr_warn("exhausted allocatable IRQ sources!\n"); irq_source_id = -EFAULT; goto unlock; } @@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= BITS_PER_LONG) { - printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + pr_err("IRQ source ID out of range!\n"); goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..4c91f626c058 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -76,6 +76,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, } /* + * kvm_register_test_and_mark_available() is a special snowflake that uses an + * arch bitop directly to avoid the explicit instrumentation that comes with + * the generic bitops. This allows code that cannot be instrumented (noinstr + * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers. + */ +static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +/* * The "raw" register helpers are only for cases where the full 64 bits of a * register are read/written irrespective of current vCPU mode. In other words, * odds are good you shouldn't be using the raw variants. diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 2d9662be8333..ab65f3a47dfd 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -220,7 +220,8 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); - unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + bool (*is_smm)(struct x86_emulate_ctxt *ctxt); + bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -275,10 +276,6 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; -/* These match some of the HF_* flags defined in kvm_host.h */ -#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ -#define X86EMUL_SMM_MASK (1 << 6) - /* * fastop functions are declared as taking a never-defined fastop parameter, * so they can't be called from C directly. diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ee4f696a0782..482d6639ef88 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V. 
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/kvm_host.h>
 #include <asm/mshyperv.h>
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4efdb4a4d72c..e542cf285b51 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -15,6 +15,7 @@
 *
 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
@@ -166,9 +167,19 @@ static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
 return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
 }

+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
- switch (map->mode) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
 case KVM_APIC_MODE_X2APIC: {
 u32 offset = (dest_id >> 16) * 16;
 u32 max_apic_id = map->max_apic_id;
@@ -193,8 +204,10 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
 *mask = dest_id & 0xf;
 return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
 default:
- /* Not optimized. */
+ WARN_ON_ONCE(1);
 return false;
 }
 }
@@ -206,6 +219,134 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
 kvfree(map);
 }

+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation will be
+ * detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */ + if (apic_x2apic_mode(apic)) + physical_id = x2apic_id; + else + physical_id = xapic_id; + + if (new->phys_map[physical_id]) + return -EINVAL; + + new->phys_map[physical_id] = apic; + } + + return 0; +} + +static void kvm_recalculate_logical_map(struct kvm_apic_map *new, + struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + enum kvm_apic_logical_mode logical_mode; + struct kvm_lapic **cluster; + u16 mask; + u32 ldr; + + if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) + return; + + if (!kvm_apic_sw_enabled(apic)) + return; + + ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (!ldr) + return; + + if (apic_x2apic_mode(apic)) { + logical_mode = KVM_APIC_MODE_X2APIC; + } else { + ldr = GET_APIC_LOGICAL_ID(ldr); + if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) + logical_mode = KVM_APIC_MODE_XAPIC_FLAT; + else + logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; + } + + /* + * To optimize logical mode delivery, all software-enabled APICs must + * be configured for the same mode. + */ + if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { + new->logical_mode = logical_mode; + } else if (new->logical_mode != logical_mode) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + return; + } + + /* + * In x2APIC mode, the LDR is read-only and derived directly from the + * x2APIC ID, thus is guaranteed to be addressable. KVM reuses + * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by + * reversing the LDR calculation to get cluster of APICs, i.e. no + * additional work is required. + */ + if (apic_x2apic_mode(apic)) { + WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + return; + } + + if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, + &cluster, &mask))) { + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + return; + } + + if (!mask) + return; + + ldr = ffs(mask) - 1; + if (!is_power_of_2(mask) || cluster[ldr]) + new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; + else + cluster[ldr] = apic; +} + /* * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock. * @@ -224,6 +365,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ + bool xapic_id_mismatch = false; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -256,54 +398,41 @@ void kvm_recalculate_apic_map(struct kvm *kvm) goto out; new->max_apic_id = max_id; + new->logical_mode = KVM_APIC_MODE_SW_DISABLED; kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_lapic **cluster; - u16 mask; - u32 ldr; - u8 xapic_id; - u32 x2apic_id; - if (!kvm_apic_present(vcpu)) continue; - xapic_id = kvm_xapic_id(apic); - x2apic_id = kvm_x2apic_id(apic); - - /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ - if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && - x2apic_id <= new->max_apic_id) - new->phys_map[x2apic_id] = apic; - /* - * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, - * prevent them from masking VCPUs with APIC ID <= 0xff. 
- */ - if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) - new->phys_map[xapic_id] = apic; - - if (!kvm_apic_sw_enabled(apic)) - continue; - - ldr = kvm_lapic_get_reg(apic, APIC_LDR); - - if (apic_x2apic_mode(apic)) { - new->mode |= KVM_APIC_MODE_X2APIC; - } else if (ldr) { - ldr = GET_APIC_LOGICAL_ID(ldr); - if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) - new->mode |= KVM_APIC_MODE_XAPIC_FLAT; - else - new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; + if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + kvfree(new); + new = NULL; + goto out; } - if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) - continue; - - if (mask) - cluster[ffs(mask) - 1] = apic; + kvm_recalculate_logical_map(new, vcpu); } out: + /* + * The optimized map is effectively KVM's internal version of APICv, + * and all unwanted aliasing that results in disabling the optimized + * map also applies to APICv. + */ + if (!new) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); + + if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); + + if (xapic_id_mismatch) + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); + else + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); + old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); @@ -360,11 +489,6 @@ static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } -static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) -{ - return ((id >> 4) << 16) | (1 << (id & 0xf)); -} - static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = kvm_apic_calc_x2apic_ldr(id); @@ -941,8 +1065,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + pr_info("Disabled LAPIC found during irq injection\n"); } } @@ -951,7 +1074,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && - map->mode != KVM_APIC_MODE_X2APIC)) + map->logical_mode != KVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; @@ -1364,7 +1487,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) { ktime_t remaining, now; s64 ns; - u32 tmcct; ASSERT(apic != NULL); @@ -1379,10 +1501,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); - tmcct = div64_u64(ns, - (APIC_BUS_CYCLE_NS * apic->divide_count)); - - return tmcct; + return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) @@ -1442,19 +1561,15 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) -static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) +u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { - unsigned char alignment = offset & 0xf; - u32 result; - /* this 
bitmask has a bit cleared for each reserved register */ + /* Leave bits '0' for reserved and write-only registers. */ u64 valid_reg_mask = APIC_REG_MASK(APIC_ID) | APIC_REG_MASK(APIC_LVR) | APIC_REG_MASK(APIC_TASKPRI) | APIC_REG_MASK(APIC_PROCPRI) | APIC_REG_MASK(APIC_LDR) | - APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_SPIV) | APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | @@ -1474,21 +1589,33 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); - /* - * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR - * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be - * manually handled by the caller. - */ + /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */ if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | + APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_ICR2); - else - WARN_ON_ONCE(offset == APIC_ICR); + + return valid_reg_mask; +} +EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask); + +static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) +{ + unsigned char alignment = offset & 0xf; + u32 result; + + /* + * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in + * x2APIC and needs to be manually handled by the caller. + */ + WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR); if (alignment + len > 4) return 1; - if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) + if (offset > 0x3f0 || + !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset))) return 1; result = __apic_read(apic, offset & ~0xf); @@ -1560,7 +1687,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) if (apic->lapic_timer.period < min_period) { pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " + "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); @@ -1841,11 +1968,15 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) if (unlikely(count_reg != APIC_TMICT)) { deadline = tmict_to_ns(apic, kvm_lapic_get_reg(apic, count_reg)); - if (unlikely(deadline <= 0)) - deadline = apic->lapic_timer.period; + if (unlikely(deadline <= 0)) { + if (apic_lvtt_period(apic)) + deadline = apic->lapic_timer.period; + else + deadline = 0; + } else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( - "kvm: vcpu %i: requested lapic timer restore with " + "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). 
" "Using initial count to start timer.\n", apic->vcpu->vcpu_id, @@ -2068,19 +2199,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) } } -static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) -{ - struct kvm *kvm = apic->vcpu->kvm; - - if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm)) - return; - - if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id) - return; - - kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); -} - static int get_lvt_index(u32 reg) { if (reg == APIC_LVTCMCI) @@ -2101,7 +2219,6 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) { kvm_apic_set_xapic_id(apic, val >> 24); - kvm_lapic_xapic_id_updated(apic); } else { ret = 1; } @@ -2219,10 +2336,14 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_SELF_IPI: - if (apic_x2apic_mode(apic)) - kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0); - else + /* + * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold + * the vector, everything else is reserved. + */ + if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK)) ret = 1; + else + kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0); break; default: ret = 1; @@ -2284,23 +2405,18 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) struct kvm_lapic *apic = vcpu->arch.apic; u64 val; - if (apic_x2apic_mode(apic)) { - if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm)) - return; - } else { - val = kvm_lapic_get_reg(apic, offset); - } - /* * ICR is a single 64-bit register when x2APIC is enabled. For legacy * xAPIC, ICR writes need to go down the common (slightly slower) path * to get the upper half from ICR2. 
*/ if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + val = kvm_lapic_get_reg64(apic, APIC_ICR); kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); trace_kvm_apic_write(APIC_ICR, val); } else { /* TODO: optimize to just emulate side effect w/o one more write */ + val = kvm_lapic_get_reg(apic, offset); kvm_lapic_reg_write(apic, offset, (u32)val); } } @@ -2394,11 +2510,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } - if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) - kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + else if (value & MSR_IA32_APICBASE_ENABLE) + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { - kvm_vcpu_update_apicv(vcpu); + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); } @@ -2429,6 +2549,78 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) */ apic->isr_count = count_vectors(apic->regs + APIC_ISR); } + apic->highest_isr_cache = -1; +} + +int kvm_alloc_apic_access_page(struct kvm *kvm) +{ + struct page *page; + void __user *hva; + int ret = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled || + kvm->arch.apic_access_memslot_inhibited) + goto out; + + hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (IS_ERR(hva)) { + ret = PTR_ERR(hva); + goto out; + } + + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) { + ret = -EFAULT; + goto out; + } + + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_memslot_enabled = true; +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); + +void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.apic_access_memslot_enabled) + return; + + kvm_vcpu_srcu_read_unlock(vcpu); + + mutex_lock(&kvm->slots_lock); + + if (kvm->arch.apic_access_memslot_enabled) { + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + /* + * Clear "enabled" after the memslot is deleted so that a + * different vCPU doesn't get a false negative when checking + * the flag out of slots_lock. No additional memory barrier is + * needed as modifying memslots requires waiting other vCPUs to + * drop SRCU (see above), and false positives are ok as the + * flag is rechecked after acquiring slots_lock. + */ + kvm->arch.apic_access_memslot_enabled = false; + + /* + * Mark the memslot as inhibited to prevent reallocating the + * memslot during vCPU creation, e.g. if a vCPU is hotplugged. 
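/*
 * Sketch, not part of the patch: the two MSR_IA32_APICBASE bits the hunk
 * above reacts to.  EN (bit 11) and EXTD (bit 10) select the APIC mode;
 * on an x2APIC -> xAPIC downgrade KVM now rewrites the xAPIC ID from
 * vcpu_id, since the ID register moves back to bits 31:24.  "demo_"
 * names are hypothetical.
 */
#include <stdint.h>

#define DEMO_APICBASE_ENABLE (1ull << 11)	/* EN: global enable */
#define DEMO_X2APIC_ENABLE   (1ull << 10)	/* EXTD: x2APIC mode */

enum demo_apic_mode { DEMO_DISABLED, DEMO_XAPIC, DEMO_X2APIC, DEMO_INVALID };

static enum demo_apic_mode demo_apic_mode(uint64_t apic_base)
{
	int en = !!(apic_base & DEMO_APICBASE_ENABLE);
	int extd = !!(apic_base & DEMO_X2APIC_ENABLE);

	if (en)
		return extd ? DEMO_X2APIC : DEMO_XAPIC;
	return extd ? DEMO_INVALID : DEMO_DISABLED;	/* EXTD without EN: #GP */
}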
+ */ + kvm->arch.apic_access_memslot_inhibited = true; + } + + mutex_unlock(&kvm->slots_lock); + + kvm_vcpu_srcu_read_lock(vcpu); } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -2484,7 +2676,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } kvm_apic_update_apicv(vcpu); - apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2756,9 +2947,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - if (!apic_x2apic_mode(apic)) - kvm_lapic_xapic_id_updated(apic); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); @@ -2772,7 +2960,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) __start_apic_timer(apic, APIC_TMCCT); kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); - apic->highest_isr_cache = -1; if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); @@ -2943,13 +3130,17 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) { /* - * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and + * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and * can be written as such, all other registers remain accessible only * through 32-bit reads/writes. */ if (reg == APIC_ICR) return kvm_x2apic_icr_write(apic, data); + /* Bits 63:32 are reserved in all other registers. */ + if (data >> 32) + return 1; + return kvm_lapic_reg_write(apic, reg, (u32)data); } @@ -2972,9 +3163,6 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR) - return 1; - return kvm_lapic_msr_read(apic, reg, data); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 58c3242fcc7a..0a0ea4b5dd8c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -112,6 +112,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); +int kvm_alloc_apic_access_page(struct kvm *kvm); +void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); @@ -144,6 +146,8 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); +u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic); + #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6bdaacb6faa0..168c46fd8dd1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm) } #ifdef CONFIG_X86_64 -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +extern bool tdp_mmu_enabled; #else -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#define tdp_mmu_enabled false #endif static inline bool 
kvm_memslots_have_rmaps(struct kvm *kvm) { - return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); + return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 835426254e76..c8ebe542c565 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -14,6 +14,7 @@ * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" @@ -43,6 +44,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kstrtox.h> #include <linux/kthread.h> #include <asm/page.h> @@ -99,6 +101,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; +static bool __ro_after_init tdp_mmu_allowed; + +#ifdef CONFIG_X86_64 +bool __read_mostly tdp_mmu_enabled = true; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +#endif + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; @@ -261,6 +270,17 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); + +/* Flush the range of guest memory mapped by the given SPTE. */ +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) +{ + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); + + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); +} + static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -609,9 +629,14 @@ static bool mmu_spte_age(u64 *sptep) return true; } +static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) +{ + return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* @@ -630,7 +655,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* @@ -800,7 +825,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1174,8 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) drop_spte(kvm, sptep); if (flush) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* @@ -1279,7 +1303,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); @@ -1312,7 +1336,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) 
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); @@ -1395,7 +1419,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, } } - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); @@ -1456,7 +1480,7 @@ restart: } if (need_flush && kvm_available_flush_tlb_with_range()) { - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, level); return false; } @@ -1558,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; @@ -1571,7 +1595,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); return flush; @@ -1626,8 +1650,7 @@ static void __rmap_add(struct kvm *kvm, kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); - kvm_flush_remote_tlbs_with_address( - kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } @@ -1646,7 +1669,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; @@ -1659,7 +1682,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; @@ -1921,7 +1944,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) return true; /* TDP MMU pages do not use the MMU generation. */ - return !sp->tdp_mmu_page && + return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2355,7 +2378,16 @@ static void __link_shadow_page(struct kvm *kvm, mmu_page_add_parent_pte(cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + /* + * The non-direct sub-pagetable must be updated before linking. For + * L1 sp, the pagetable is updated via kvm_sync_page() in + * kvm_mmu_find_shadow_page() without write-protecting the gfn, + * so sp->unsync can be true or false. For higher level non-direct + * sp, the pagetable is updated/synced via mmu_sync_children() in + * FNAME(fetch)(), so sp->unsync_children can only be false. + * WARN_ON_ONCE() if anything happens unexpectedly. 
+ */ + if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } @@ -2383,7 +2415,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, return; drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2867,8 +2899,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (flush) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); pgprintk("%s: setting spte %llx\n", __func__, *sptep); @@ -3116,11 +3147,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* - * A small SPTE exists for this pfn, but FNAME(fetch) - * and __direct_map would like to create a large PTE - * instead: just force them to go down another level, - * patching back for them into pfn the next 9 bits of - * the address. + * A small SPTE exists for this pfn, but FNAME(fetch), + * direct_map(), or kvm_tdp_mmu_map() would like to create a + * large PTE instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of the + * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); @@ -3129,7 +3160,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ } } -static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3147,7 +3178,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; @@ -3173,14 +3204,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); + unsigned long hva = gfn_to_hva_memslot(slot, gfn); + + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } -static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - if (is_sigpending_pfn(pfn)) { + if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -3190,43 +3223,43 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. 
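/*
 * Sketch, not part of the patch: the arithmetic behind "patching back ...
 * the next 9 bits" in disallowed_hugepage_adjust() above, assuming 4KiB
 * base pages (each level covers 9 more gfn bits).  "demo_" names are
 * hypothetical.
 */
#include <stdint.h>

static uint64_t demo_pages_per_hpage(int level)
{
	return 1ull << (9 * (level - 1));
}

static uint64_t demo_demote_pfn(uint64_t pfn, uint64_t gfn, int cur_level)
{
	/*
	 * cur_level == 2: mask = 512 - 1 = 0x1ff; cur_level == 3:
	 * mask = 0x40000 - 0x200 = 0x3fe00.  Either way the mask selects
	 * the 9 gfn bits that pick a level-(cur_level - 1) chunk inside
	 * the level-cur_level page, and those bits are OR'd into the pfn.
	 */
	uint64_t mask = demo_pages_per_hpage(cur_level) -
			demo_pages_per_hpage(cur_level - 1);

	return pfn | (gfn & mask);
}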
*/ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); + if (fault->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } -static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + unsigned int access) { - /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + gva_t gva = fault->is_tdp ? 0 : fault->addr; - if (unlikely(!fault->slot)) { - gva_t gva = fault->is_tdp ? 0 : fault->addr; + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, + access & shadow_mmio_access_mask); - vcpu_cache_mmio_info(vcpu, gva, fault->gfn, - access & shadow_mmio_access_mask); - /* - * If MMIO caching is disabled, emulate immediately without - * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. Do not cache MMIO - * whose gfn is greater than host.MAXPHYADDR, any guest that - * generates such gfns is running nested and is being tricked - * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if - * and only if L1's MAXPHYADDR is inaccurate with respect to - * the hardware's). - */ - if (unlikely(!enable_mmio_caching) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) - return RET_PF_EMULATE; - } + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!enable_mmio_caching)) + return RET_PF_EMULATE; + + /* + * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, + * any guest that generates such gfns is running nested and is being + * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and + * only if L1's MAXPHYADDR is inaccurate with respect to the + * hardware's). 
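/*
 * Sketch, not part of the patch: the error-pfn triage that
 * kvm_handle_error_pfn() above implements, flattened into a table.  The
 * enum values and "demo_" names are hypothetical stand-ins for the
 * kernel's KVM_PFN_ERR_* constants and RET_PF_* codes.
 */
enum demo_err_pfn { DEMO_PFN_SIGPENDING, DEMO_PFN_RO_FAULT,
		    DEMO_PFN_HWPOISON, DEMO_PFN_OTHER };
enum demo_result { DEMO_EINTR, DEMO_EMULATE, DEMO_SIGBUS_AND_RETRY,
		   DEMO_EFAULT };

static enum demo_result demo_triage_error_pfn(enum demo_err_pfn e)
{
	switch (e) {
	case DEMO_PFN_SIGPENDING:	/* exit so the signal gets handled */
		return DEMO_EINTR;
	case DEMO_PFN_RO_FAULT:		/* read-only gfn: emulate, don't cache */
		return DEMO_EMULATE;
	case DEMO_PFN_HWPOISON:		/* poisoned page: SIGBUS, retry fault */
		return DEMO_SIGBUS_AND_RETRY;
	default:
		return DEMO_EFAULT;
	}
}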
+ */ + if (unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; return RET_PF_CONTINUE; } @@ -3350,7 +3383,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) do { u64 new_spte; - if (is_tdp_mmu(vcpu->arch.mmu)) + if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3433,8 +3466,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (++retry_count > 4) { - printk_once(KERN_WARNING - "kvm: Fast #PF retrying more than 4 times.\n"); + pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } @@ -3596,7 +3628,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (r < 0) goto out_unlock; - if (is_tdp_mmu_enabled(vcpu->kvm)) { + if (tdp_mmu_enabled) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { @@ -4026,7 +4058,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu(vcpu->arch.mmu)) + if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -4174,7 +4206,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -4235,12 +4267,33 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_CONTINUE; } +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access) +{ + int ret; + + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + ret = __kvm_faultin_pfn(vcpu, fault); + if (ret != RET_PF_CONTINUE) + return ret; + + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_error_pfn(vcpu, fault); + + if (unlikely(!fault->slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + return RET_PF_CONTINUE; +} + /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. 
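/*
 * Sketch, not part of the patch: the snapshot-and-recheck shape that
 * kvm_faultin_pfn() / is_page_fault_stale() above give every fault path.
 * All helpers here are hypothetical stand-ins, not kernel APIs.
 */
#include <stdbool.h>

extern unsigned long demo_invalidate_seq;	/* bumped by notifiers */
extern void demo_resolve_pfn(void);		/* sleeps, lock not held */
extern bool demo_seq_changed(unsigned long snap);
extern void demo_lock(void), demo_unlock(void);

static bool demo_fault_once(void)
{
	unsigned long snap = demo_invalidate_seq;	/* 1. snapshot */
	/* (the kernel issues smp_rmb() here)		   2. order it   */
	demo_resolve_pfn();				/* 3. slow lookup */

	demo_lock();	/* read-lock for the TDP MMU, write-lock otherwise */
	if (demo_seq_changed(snap)) {			/* 4. recheck */
		demo_unlock();
		return false;		/* stale: a notifier ran, retry */
	}
	/* 5. safe to install the SPTE for the resolved pfn */
	demo_unlock();
	return true;
}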
*/ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, int mmu_seq) + struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); @@ -4260,19 +4313,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - - unsigned long mmu_seq; int r; - fault->gfn = fault->addr >> PAGE_SHIFT; - fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); - if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4284,41 +4331,24 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; + write_lock(&vcpu->kvm->mmu_lock); - if (is_tdp_mmu_fault) - read_lock(&vcpu->kvm->mmu_lock); - else - write_lock(&vcpu->kvm->mmu_lock); + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; - if (is_tdp_mmu_fault) { - r = kvm_tdp_mmu_map(vcpu, fault); - } else { - r = make_mmu_pages_available(vcpu); - if (r) - goto out_unlock; - r = __direct_map(vcpu, fault); - } + r = direct_map(vcpu, fault); out_unlock: - if (is_tdp_mmu_fault) - read_unlock(&vcpu->kvm->mmu_lock); - else - write_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } @@ -4366,6 +4396,42 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +#ifdef CONFIG_X86_64 +static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int r; + + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + + r = fast_page_fault(vcpu, fault); + if (r != RET_PF_INVALID) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) + return r; + + r = RET_PF_RETRY; + read_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; + + r = kvm_tdp_mmu_map(vcpu, fault); + +out_unlock: + read_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(fault->pfn); + return r; +} +#endif + int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* @@ -4383,13 +4449,19 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); + gfn_t base = gfn_round_for_level(fault->gfn, + fault->max_level); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; } } +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif + return direct_page_fault(vcpu, fault); } @@ -4494,10 +4566,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) 
struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; - if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { - /* kvm_mmu_ensure_valid_pgd will set up a new root. */ + /* + * Return immediately if no usable root was found, kvm_mmu_reload() + * will establish a valid root prior to the next VM-Enter. + */ + if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; - } /* * It's possible that the cached previous root page is obsolete because @@ -5719,6 +5793,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; +#ifdef CONFIG_X86_64 + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; +#endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when @@ -5966,7 +6043,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* @@ -5991,7 +6068,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } @@ -6017,9 +6094,11 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + if (tdp_mmu_enabled) { + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; + } node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -6049,7 +6128,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); - kvm_mmu_uninit_tdp_mmu(kvm); + if (tdp_mmu_enabled) + kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } @@ -6103,7 +6183,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, gfn_end, true, flush); @@ -6136,7 +6216,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); @@ -6379,7 +6459,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, u64 start, u64 end, int target_level) { - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) @@ -6400,7 +6480,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { @@ -6450,8 +6530,7 @@ restart: kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; @@ -6483,7 +6562,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if 
(is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); @@ -6518,7 +6597,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); @@ -6553,7 +6632,7 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -6579,7 +6658,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * zap all shadow pages. */ if (unlikely(gen == 0)) { - kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } @@ -6684,7 +6763,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) new_val = 1; else if (sysfs_streq(val, "auto")) new_val = get_nx_auto_mode(); - else if (strtobool(val, &new_val) < 0) + else if (kstrtobool(val, &new_val) < 0) return -EINVAL; __set_nx_huge_pages(new_val); @@ -6718,6 +6797,13 @@ void __init kvm_mmu_x86_module_init(void) if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + /* + * Snapshot userspace's desire to enable the TDP MMU. Whether or not the + * TDP MMU is actually enabled is determined in kvm_configure_mmu() + * when the vendor module is loaded. + */ + tdp_mmu_allowed = tdp_mmu_enabled; + kvm_mmu_spte_module_init(); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index dbaf6755c5a7..cc58631e2336 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -156,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } +static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); @@ -164,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); + +/* Flush the given page (huge or not) of guest memory. */ +static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) +{ + kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); +} + unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; @@ -199,7 +213,7 @@ struct kvm_page_fault { /* * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; @@ -222,6 +236,7 @@ struct kvm_page_fault { struct kvm_memory_slot *slot; /* Outputs of kvm_faultin_pfn. 
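/*
 * Worked example, not part of the patch, for gfn_round_for_level() above:
 * -KVM_PAGES_PER_HPAGE(level) is the two's complement of a power of two,
 * i.e. a mask that clears the low bits, rounding the gfn down to the
 * first gfn of its level-sized page.  Assumes 4KiB base pages.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_gfn_round_for_level(uint64_t gfn, int level)
{
	uint64_t pages = 1ull << (9 * (level - 1));

	return gfn & -pages;
}

int main(void)
{
	assert(demo_gfn_round_for_level(0x12345, 2) == 0x12200); /* 2MiB */
	assert(demo_gfn_round_for_level(0x12345, 3) == 0x00000); /* 1GiB */
	return 0;
}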
*/ + unsigned long mmu_seq; kvm_pfn_t pfn; hva_t hva; bool map_writable; @@ -279,6 +294,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, }; int r; + if (vcpu->arch.mmu->root_role.direct) { + fault.gfn = fault.addr >> PAGE_SHIFT; + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + /* * Async #PF "faults", a.k.a. prefetch faults, are not faults from the * guest perspective and have already been counted at the time of the diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 2e09d1b6249f..0a2ac438d647 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -10,6 +10,7 @@ * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/rculist.h> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0f6455072055..57f0b75c80f9 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -642,12 +642,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, fault->addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { + for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); + if (it.level == gw->level) + break; table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; @@ -692,8 +692,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { - clear_sp_write_flooding_count(it.sptep); - /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
@@ -701,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; @@ -791,7 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - unsigned long mmu_seq; bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); @@ -838,14 +835,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -871,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -937,8 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, - sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); if (!rmap_can_add(vcpu)) break; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c0fd7e049b4e..c15bfca3ed15 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -7,7 +7,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates. 
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" @@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, WARN_ON_ONCE(!pte_access && !shadow_present_mask); if (sp->role.ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) - spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -317,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; @@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte) WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), - "kvm: Access Tracking saved bit locations are not zero\n"); + "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6f54dc9409c9..1279db2eab44 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -28,10 +28,10 @@ */ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) -static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); +#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -266,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); /* - * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. 
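/*
 * Sketch, not part of the patch: the 2-bit TDP A/D *type* (a value, not a
 * mask, hence the rename above) stored at SPTE bits 53:52.  Constants
 * mirror the SPTE_TDP_AD_* definitions in the hunk; "demo_" names are
 * hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_AD_SHIFT		52
#define DEMO_AD_MASK		(3ull << DEMO_AD_SHIFT)
#define DEMO_AD_ENABLED		(0ull << DEMO_AD_SHIFT)
#define DEMO_AD_DISABLED	(1ull << DEMO_AD_SHIFT)
#define DEMO_AD_WRPROT_ONLY	(2ull << DEMO_AD_SHIFT)

/* Hardware A/D bits are usable unless the type is DISABLED. */
static bool demo_ad_enabled(uint64_t spte)
{
	return (spte & DEMO_AD_MASK) != DEMO_AD_DISABLED;
}

/* Dirty logging must write-protect for any type other than ENABLED. */
static bool demo_ad_need_write_protect(uint64_t spte)
{
	return (spte & DEMO_AD_MASK) != DEMO_AD_ENABLED;
}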
*/ - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), - "kvm: MMU-writable SPTE is not Host-writable: %llx", + KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), - "kvm: Writable SPTE is not MMU-writable: %llx", spte); + KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 39b48e7d7d1a..d2eb0d4f8710 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" @@ -15,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } -static gfn_t round_gfn_for_level(gfn_t gfn, int level) -{ - return gfn & -KVM_PAGES_PER_HPAGE(level); -} - /* * Return the TDP iterator to the root PT and allow it to continue its * traversal over the paging structure from there. @@ -30,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter) iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; @@ -97,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -139,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter) return false; iter->level++; - iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d6df38d371a0..7c25dbf32ecc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" @@ -10,23 +11,15 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = true; -module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); - /* Initializes the TDP MMU for the VM, if enabled. */ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return 0; - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; - /* This should not be changed for the lifetime of the VM. */ - kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); kvm->arch.tdp_mmu_zap_wq = wq; @@ -47,9 +40,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!kvm->arch.tdp_mmu_enabled) - return; - /* Also waits for any queued work items. 
*/ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); @@ -144,7 +134,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!root->tdp_mmu_page); + WARN_ON(!is_tdp_mmu_page(root)); /* * The root now has refcount=0. It is valid, but readers already @@ -690,8 +680,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, if (ret) return ret; - kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, - KVM_PAGES_PER_HPAGE(iter->level)); + kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* * No other thread can overwrite the removed SPTE as they must either @@ -1090,8 +1079,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && !is_last_spte(iter->old_spte, iter->level)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(iter->level + 1)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* * If the page fault was caused by a write but the page is write diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d3714200b932..0a63b1afabd3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,6 +7,9 @@ #include "spte.h" +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) @@ -68,31 +71,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } - -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) -{ - struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root.hpa; - - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - /* - * A NULL shadow page is legal when shadowing a non-paging guest with - * PAE paging, as the MMU will be direct with root_hpa pointing at the - * pae_root page, not a shadow page. 
- */ - sp = to_shadow_page(hpa); - return sp && is_tdp_mmu_page(sp) && sp->root_count; -} #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a8502e02f479..9fac1ec03463 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -13,6 +13,7 @@ * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mtrr.h> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb594620dd75..612e6c70ce2e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -9,6 +9,7 @@ * Gleb Natapov <gleb@redhat.com> * Wei Huang <wei@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kvm_host.h> @@ -28,9 +29,18 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { +/* Precise Distribution of Instructions Retired (PDIR) */ +static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + /* Instruction-Accurate PDIR (PDIR++) */ + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + {} +}; + +/* Precise Distribution (PDist) */ +static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), {} }; @@ -155,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event, kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) +{ + /* + * For some model specific pebs counters with special capabilities + * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise + * level to the maximum value (currently 3, backwards compatible) + * so that the perf subsystem would assign specific hardware counter + * with that capability for vPMC. + */ + if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) || + (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu))) + return 3; + + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + */ + return 1; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -186,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, } if (pebs) { /* - * The non-zero precision level of guest event makes the ordinary - * guest event becomes a guest PEBS event and triggers the host - * PEBS PMI handler to determine whether the PEBS overflow PMI - * comes from the host counters or the guest. - * * For most PEBS hardware events, the difference in the software * precision levels of guest and host PEBS events will not affect * the accuracy of the PEBS profiling result, because the "event IP" * in the PEBS record is calibrated on the guest side. - * - * On Icelake everything is fine. Other hardware (GLC+, TNT+) that - * could possibly care here is unsupported and needs changes. 
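/*
 * Sketch, not part of the patch: where the precise level chosen by
 * pmc_get_pebs_precise_level() above ends up.  perf_event_attr::precise_ip
 * is the standard 0..3 field; >= 1 turns the event into a PEBS event, and
 * 3 asks perf for a PDIR/PDist-capable counter.  The user-space framing
 * and the "demo_" name are assumptions.
 */
#include <linux/perf_event.h>
#include <string.h>

static void demo_fill_pebs_attr(struct perf_event_attr *attr,
				unsigned int precise_level)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_RAW;
	attr->precise_ip = precise_level;	/* 1 or 3 in the code above */
}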
*/ - attr.precise_ip = 1; - if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) - attr.precise_ip = 3; + attr.precise_ip = pmc_get_pebs_precise_level(pmc); } event = perf_event_create_kernel_counter(&attr, -1, current, @@ -254,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *pa, const void *pb) +static int filter_cmp(const void *pa, const void *pb, u64 mask) { - u64 a = *(u64 *)pa; - u64 b = *(u64 *)pb; + u64 a = *(u64 *)pa & mask; + u64 b = *(u64 *)pb & mask; return (a > b) - (a < b); } + +static int filter_sort_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT | + KVM_PMU_MASKED_ENTRY_EXCLUDE)); +} + +/* + * For the event filter, searching is done on the 'includes' list and + * 'excludes' list separately rather than on the 'events' list (which + * has both). As a result the exclude bit can be ignored. + */ +static int filter_event_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT)); +} + +static int find_filter_index(u64 *events, u64 nevents, u64 key) +{ + u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]), + filter_event_cmp); + + if (!fe) + return -1; + + return fe - events; +} + +static bool is_filter_entry_match(u64 filter_event, u64 umask) +{ + u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8); + u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH; + + BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >> + (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) != + ARCH_PERFMON_EVENTSEL_UMASK); + + return (umask & mask) == match; +} + +static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel) +{ + u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT; + u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK; + int i, index; + + index = find_filter_index(events, nevents, event_select); + if (index < 0) + return false; + + /* + * Entries are sorted by the event select. Walk the list in both + * directions to process all entries with the targeted event select. 
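/*
 * Sketch, not part of the patch: why the walk above goes in both
 * directions.  With duplicate keys, bsearch(3) returns *a* matching
 * element, not necessarily the first one, so equal-keyed neighbours on
 * both sides of the hit must be visited.  "demo_" names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static int demo_cmp(const void *pa, const void *pb)
{
	uint64_t a = *(const uint64_t *)pa, b = *(const uint64_t *)pb;

	return (a > b) - (a < b);
}

static size_t demo_count_equal(const uint64_t *ev, size_t n, uint64_t key)
{
	const uint64_t *hit = bsearch(&key, ev, n, sizeof(*ev), demo_cmp);
	size_t i, cnt = 0;

	if (!hit)
		return 0;
	for (i = hit - ev; i < n && ev[i] == key; i++)	/* forward  */
		cnt++;
	for (i = hit - ev; i-- > 0 && ev[i] == key; )	/* backward */
		cnt++;
	return cnt;
}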
+ */ + for (i = index; i < nevents; i++) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + for (i = index - 1; i >= 0; i--) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + return false; +} + +static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, + u64 eventsel) +{ + if (filter_contains_match(f->includes, f->nr_includes, eventsel) && + !filter_contains_match(f->excludes, f->nr_excludes, eventsel)) + return f->action == KVM_PMU_EVENT_ALLOW; + + return f->action == KVM_PMU_EVENT_DENY; +} + +static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, + int idx) +{ + int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + + return true; +} + static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - struct kvm_pmu_event_filter *filter; + struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - bool allow_event = true; - __u64 key; - int idx; if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) - goto out; + return true; - if (pmc_is_gp(pmc)) { - key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } else { - idx = pmc->idx - INTEL_PMC_IDX_FIXED; - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - } + if (pmc_is_gp(pmc)) + return is_gp_event_allowed(filter, pmc->eventsel); -out: - return allow_event; + return is_fixed_event_allowed(filter, pmc->idx); } static void reprogram_counter(struct kvm_pmc *pmc) @@ -592,47 +694,133 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) +{ + u64 mask = kvm_pmu_ops.EVENTSEL_EVENT | + KVM_PMU_MASKED_ENTRY_UMASK_MASK | + KVM_PMU_MASKED_ENTRY_UMASK_MATCH | + KVM_PMU_MASKED_ENTRY_EXCLUDE; + int i; + + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & ~mask) + return false; + } + + return true; +} + +static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter) +{ + int i, j; + + for (i = 0, j = 0; i < filter->nevents; i++) { + /* + * Skip events that are impossible to match against a guest + * event. When filtering, only the event select + unit mask + * of the guest event is used. To maintain backwards + * compatibility, impossible filters can't be rejected :-( + */ + if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK)) + continue; + /* + * Convert userspace events to a common in-kernel event so + * only one code path is needed to support both events. For + * the in-kernel events use masked events because they are + * flexible enough to handle both cases. 
To convert to masked + * events all that's needed is to add an "all ones" umask_mask, + * (unmasked filter events don't support EXCLUDE). + */ + filter->events[j++] = filter->events[i] | + (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT); + } + + filter->nevents = j; +} + +static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter) +{ + int i; + + if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)) + convert_to_masked_filter(filter); + else if (!is_masked_filter_valid(filter)) + return -EINVAL; + + /* + * Sort entries by event select and includes vs. excludes so that all + * entries for a given event select can be processed efficiently during + * filtering. The EXCLUDE flag uses a more significant bit than the + * event select, and so the sorted list is also effectively split into + * includes and excludes sub-lists. + */ + sort(&filter->events, filter->nevents, sizeof(filter->events[0]), + filter_sort_cmp, NULL); + + i = filter->nevents; + /* Find the first EXCLUDE event (only supported for masked events). */ + if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) + break; + } + } + + filter->nr_includes = i; + filter->nr_excludes = filter->nevents - filter->nr_includes; + filter->includes = filter->events; + filter->excludes = filter->events + filter->nr_includes; + + return 0; +} + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { - struct kvm_pmu_event_filter tmp, *filter; + struct kvm_pmu_event_filter __user *user_filter = argp; + struct kvm_x86_pmu_event_filter *filter; + struct kvm_pmu_event_filter tmp; struct kvm_vcpu *vcpu; unsigned long i; size_t size; int r; - if (copy_from_user(&tmp, argp, sizeof(tmp))) + if (copy_from_user(&tmp, user_filter, sizeof(tmp))) return -EFAULT; if (tmp.action != KVM_PMU_EVENT_ALLOW && tmp.action != KVM_PMU_EVENT_DENY) return -EINVAL; - if (tmp.flags != 0) + if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK) return -EINVAL; if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) return -E2BIG; size = struct_size(filter, events, tmp.nevents); - filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + filter = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!filter) return -ENOMEM; + filter->action = tmp.action; + filter->nevents = tmp.nevents; + filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap; + filter->flags = tmp.flags; + r = -EFAULT; - if (copy_from_user(filter, argp, size)) + if (copy_from_user(filter->events, user_filter->events, + sizeof(filter->events[0]) * filter->nevents)) goto cleanup; - /* Ensure nevents can't be changed between the user copies. */ - *filter = tmp; - - /* - * Sort the in-kernel list so that we can search it with bsearch. 
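/*
 * Worked example, not part of the patch, for is_filter_entry_match() and
 * the sort above.  Assumed layout (per the uapi masked-events format):
 * umask_mask at bits 63:56, umask_match at bits 15:8, EXCLUDE at bit 55.
 * Because EXCLUDE outranks the event-select bits in the sort key, include
 * entries sort ahead of excludes, producing the two sub-lists.  An entry
 * with umask_mask = 0xf0 and umask_match = 0x30 matches guest unit masks
 * 0x30..0x3f.  "demo_" names are hypothetical.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_UMASK_MASK_SHIFT	56
#define DEMO_UMASK_MATCH	0xff00ull	/* bits 15:8 */

static int demo_umask_match(uint64_t entry, uint64_t eventsel_bits_15_8)
{
	/* Shift umask_mask down so it lines up with the guest's umask. */
	uint64_t mask = entry >> (DEMO_UMASK_MASK_SHIFT - 8);
	uint64_t match = entry & DEMO_UMASK_MATCH;

	return (eventsel_bits_15_8 & mask) == match;
}

int main(void)
{
	uint64_t entry = (0xf0ull << DEMO_UMASK_MASK_SHIFT) | (0x30ull << 8);

	assert(demo_umask_match(entry, 0x3a00));	/* umask 0x3a */
	assert(!demo_umask_match(entry, 0x4000));	/* umask 0x40 */
	return 0;
}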
- */ - sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + r = prepare_filter_lists(filter); + if (r) + goto cleanup; mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > @@ -643,8 +831,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); - mutex_unlock(&kvm->lock); - r = 0; cleanup: kfree(filter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ee67ba625094..be62c16f2265 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,12 +18,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -struct kvm_event_hw_type_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; -}; - struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); @@ -40,6 +34,9 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + + const u64 EVENTSEL_EVENT; + const int MAX_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -161,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -static inline void kvm_init_pmu_capability(void) +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; @@ -192,6 +189,8 @@ static inline void kvm_init_pmu_capability(void) } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); } diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 81f4e9ce0c77..a5717282bb9c 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -14,6 +14,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, + CPUID_8000_0007_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -43,6 +44,9 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* CPUID level 0x80000007 (EDX). 
*/ +#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) + struct cpuid_reg { u32 function; u32 index; @@ -68,6 +72,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, + [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, }; @@ -101,6 +106,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX2; else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) return KVM_X86_FEATURE_SGX_EDECCSSA; + else if (x86_feature == X86_FEATURE_CONSTANT_TSC) + return KVM_X86_FEATURE_CONSTANT_TSC; return x86_feature; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index a9c1c2af8d94..b42111a24cc2 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "x86.h" @@ -110,8 +111,6 @@ static void check_smram_offsets(void) void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); if (entering_smm) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6919dee69f18..ca684979e90d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/hashtable.h> @@ -53,7 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); -enum avic_modes avic_mode; +bool x2avic_enabled; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -72,20 +72,25 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - /* Note: - * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC - * MSR accesses, while interrupt injection to a running vCPU - * can be achieved using AVIC doorbell. The AVIC hardware still - * accelerate MMIO accesses, but this does not cause any harm - * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + /* + * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR + * accesses, while interrupt injection to a running vCPU can be + * achieved using AVIC doorbell. KVM disables the APIC access page + * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling + * AVIC in hybrid mode activates only the doorbell mechanism. */ - if (apic_x2apic_mode(svm->vcpu.arch.apic) && - avic_mode == AVIC_MODE_X2) { + if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; /* Disabling MSR intercept for x2APIC registers */ svm_set_x2apic_msr_interception(svm, false); } else { + /* + * Flush the TLB, the guest may have inserted a non-APIC + * mapping into the TLB while AVIC was disabled. 
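For readers unfamiliar with the reverse-CPUID machinery touched above: features the kernel tracks but KVM does not mirror word-for-word get a KVM-only synthetic word (CPUID_8000_0007_EDX here) plus a translation hook, so common code can keep using the kernel-wide feature constant. A toy sketch of the translation idea, with made-up word indexes:

    #include <stdint.h>

    /* Illustrative: a feature handle packs (word_index << 5) | bit. */
    #define FEAT(word, bit)        (((word) << 5) | (bit))
    #define KERNEL_CONSTANT_TSC    FEAT(7, 8)   /* hypothetical kernel index */
    #define KVM_WORD_8000_0007_EDX 21           /* hypothetical KVM-only word */
    #define KVM_CONSTANT_TSC       FEAT(KVM_WORD_8000_0007_EDX, 8)

    /*
     * Map a kernel-defined feature to its KVM-only alias when the kernel
     * tracks it in a word that KVM does not mirror one-to-one.
     */
    static uint32_t feature_translate(uint32_t x86_feature)
    {
        if (x86_feature == KERNEL_CONSTANT_TSC)
            return KVM_CONSTANT_TSC;
        return x86_feature;
    }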
+ */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + /* For xAVIC and hybrid-xAVIC modes */ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ @@ -241,8 +246,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) + if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || + (index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -250,47 +255,14 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, return &avic_physical_id_table[index]; } -/* - * Note: - * AVIC hardware walks the nested page table to check permissions, - * but does not use the SPA address specified in the leaf page - * table entry since it uses address in the AVIC_BACKING_PAGE pointer - * field of the VMCB. Therefore, we set up the - * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. - */ -static int avic_alloc_access_page(struct kvm *kvm) -{ - void __user *ret; - int r = 0; - - mutex_lock(&kvm->slots_lock); - - if (kvm->arch.apic_access_memslot_enabled) - goto out; - - ret = __x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); - if (IS_ERR(ret)) { - r = PTR_ERR(ret); - goto out; - } - - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - static int avic_init_backing_page(struct kvm_vcpu *vcpu) { u64 *entry, new_entry; int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || - (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) + if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || + (id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -299,7 +271,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_alloc_access_page(vcpu->kvm); + /* + * Note, AVIC hardware walks the nested page table to check + * permissions, but does not use the SPA address specified in + * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE + * pointer field of the VMCB. + */ + ret = kvm_alloc_apic_access_page(vcpu->kvm); if (ret) return ret; } @@ -339,6 +317,60 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) put_cpu(); } + +static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) +{ + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); +} + +static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, + u32 icrl) +{ + /* + * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, + * i.e. APIC ID == vCPU ID. + */ + struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); + + /* Once again, nothing to do if the target vCPU doesn't exist. */ + if (unlikely(!target_vcpu)) + return; + + avic_kick_vcpu(target_vcpu, icrl); +} + +static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, + u32 logid_index, u32 icrl) +{ + u32 physical_id; + + if (avic_logical_id_table) { + u32 logid_entry = avic_logical_id_table[logid_index]; + + /* Nothing to do if the logical destination is invalid. 
*/ + if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) + return; + + physical_id = logid_entry & + AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC, the logical APIC ID is a read-only value that is + * derived from the x2APIC ID, thus the x2APIC ID can be found + * by reversing the calculation (stored in logid_index). Note, + * bits 31:20 of the x2APIC ID aren't propagated to the logical + * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS. + */ + physical_id = logid_index; + } + + avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); +} + /* * A fast-path version of avic_kick_target_vcpus(), which attempts to match * destination APIC ID to vCPU without looping through all vCPUs. @@ -346,11 +378,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { - u32 l1_physical_id, dest; - struct kvm_vcpu *target_vcpu; int dest_mode = icrl & APIC_DEST_MASK; int shorthand = icrl & APIC_SHORT_MASK; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + u32 dest; if (shorthand != APIC_DEST_NOSHORT) return -EINVAL; @@ -367,18 +398,18 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) return -EINVAL; - l1_physical_id = dest; - - if (WARN_ON_ONCE(l1_physical_id != index)) + if (WARN_ON_ONCE(dest != index)) return -EINVAL; + avic_kick_vcpu_by_physical_id(kvm, dest, icrl); } else { - u32 bitmap, cluster; - int logid_index; + u32 *avic_logical_id_table; + unsigned long bitmap, i; + u32 cluster; if (apic_x2apic_mode(source)) { /* 16 bit dest mask, 16 bit cluster id */ - bitmap = dest & 0xFFFF0000; + bitmap = dest & 0xFFFF; cluster = (dest >> 16) << 4; } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { /* 8 bit dest mask*/ @@ -390,67 +421,32 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source cluster = (dest >> 4) << 2; } + /* Nothing to do if there are no destinations in the cluster. */ if (unlikely(!bitmap)) - /* guest bug: nobody to send the logical interrupt to */ return 0; - if (!is_power_of_2(bitmap)) - /* multiple logical destinations, use slow path */ - return -EINVAL; - - logid_index = cluster + __ffs(bitmap); - - if (!apic_x2apic_mode(source)) { - u32 *avic_logical_id_table = - page_address(kvm_svm->avic_logical_id_table_page); - - u32 logid_entry = avic_logical_id_table[logid_index]; - - if (WARN_ON_ONCE(index != logid_index)) - return -EINVAL; - - /* guest bug: non existing/reserved logical destination */ - if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) - return 0; - - l1_physical_id = logid_entry & - AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; - } else { - /* - * For x2APIC logical mode, cannot leverage the index. - * Instead, calculate physical ID from logical ID in ICRH. - */ - int cluster = (icrh & 0xffff0000) >> 16; - int apic = ffs(icrh & 0xffff) - 1; - - /* - * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) - * contains anything but a single bit, we cannot use the - * fast path, because it is limited to a single vCPU. 
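The "reversing the calculation" remark above is compact; concretely, an x2APIC logical ID is derived from the x2APIC ID as cluster = id >> 4 and bit = id & 0xf, so a logical-table index uniquely names one ID. A small standalone illustration (plain C, not kernel code):

    #include <stdint.h>

    /* x2APIC: logical ID = (cluster << 16) | (1 << (id & 0xf)), cluster = id >> 4. */
    static uint32_t x2apic_logical_id(uint32_t x2apic_id)
    {
        return ((x2apic_id >> 4) << 16) | (1u << (x2apic_id & 0xf));
    }

    /*
     * Inverse mapping: given the cluster's base table index (cluster * 16)
     * and the single set bit position, recover the x2APIC ID. This is the
     * "physical_id = logid_index" step in the hunk above.
     */
    static uint32_t x2apic_id_from(uint32_t cluster, uint32_t bit)
    {
        return (cluster << 4) | bit;
    }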
- */ - if (apic < 0 || icrh != (1 << apic)) - return -EINVAL; + if (apic_x2apic_mode(source)) + avic_logical_id_table = NULL; + else + avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); - l1_physical_id = (cluster << 4) + apic; - } + /* + * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical + * IDs, thus each bit in the destination is guaranteed to map + * to at most one vCPU. + */ + for_each_set_bit(i, &bitmap, 16) + avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, + cluster + i, icrl); } - target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); - if (unlikely(!target_vcpu)) - /* guest bug: non existing vCPU is a target of this IPI*/ - return 0; - - target_vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(target_vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); return 0; } static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh, u32 index) { + u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); unsigned long i; struct kvm_vcpu *vcpu; @@ -466,21 +462,9 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { - u32 dest; - - if (apic_x2apic_mode(vcpu->arch.apic)) - dest = icrh; - else - dest = GET_XAPIC_DEST_FIELD(icrh); - if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - dest, icrl & APIC_DEST_MASK)) { - vcpu->arch.apic->irr_pending = true; - svm_complete_interrupt_delivery(vcpu, - icrl & APIC_MODE_MASK, - icrl & APIC_INT_LEVELTRIG, - icrl & APIC_VECTOR_MASK); - } + dest, icrl & APIC_DEST_MASK)) + avic_kick_vcpu(vcpu, icrl); } } @@ -496,14 +480,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); switch (id) { + case AVIC_IPI_FAILURE_INVALID_TARGET: case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* * Emulate IPIs that are not handled by AVIC hardware, which - * only virtualizes Fixed, Edge-Triggered INTRs. The exit is - * a trap, e.g. ICR holds the correct value and RIP has been - * advanced, KVM is responsible only for emulating the IPI. - * Sadly, hardware may sometimes leave the BUSY flag set, in - * which case KVM needs to emulate the ICR write as well in + * only virtualizes Fixed, Edge-Triggered INTRs, and falls over + * if _any_ targets are invalid, e.g. if the logical mode mask + * is a superset of running vCPUs. + * + * The exit is a trap, e.g. ICR holds the correct value and RIP + * has been advanced, KVM is responsible only for emulating the + * IPI. Sadly, hardware may sometimes leave the BUSY flag set, + * in which case KVM needs to emulate the ICR write as well in * order to clear the BUSY flag. 
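The fast path above no longer bails out on multi-target logical IPIs; it decodes the destination into a per-cluster bitmap plus the cluster's base table index, then fans out over each set bit. A self-contained sketch of the same decode, assuming the three classic addressing modes:

    #include <stdint.h>

    struct logical_dest { uint16_t bitmap; uint32_t first_index; };

    /*
     * Decode an ICRH destination into a per-cluster bitmap and the table
     * index of that cluster's first entry, per APIC addressing mode.
     */
    static struct logical_dest decode_dest(uint32_t dest, int x2apic, int flat)
    {
        struct logical_dest d;

        if (x2apic) {            /* 16-bit mask, 16-bit cluster id */
            d.bitmap = dest & 0xFFFF;
            d.first_index = (dest >> 16) << 4;
        } else if (flat) {       /* 8-bit mask, single flat "cluster" */
            d.bitmap = dest & 0xFF;
            d.first_index = 0;
        } else {                 /* xAPIC cluster: 4-bit mask, 4-bit cluster */
            d.bitmap = dest & 0xF;
            d.first_index = (dest >> 4) << 2;
        }
        return d;
    }

    /* Fan out to every destination; each set bit maps to at most one vCPU. */
    static void kick_all(struct logical_dest d, void (*kick)(uint32_t index))
    {
        for (unsigned int b = 0; b < 16; b++)
            if (d.bitmap & (1u << b))
                kick(d.first_index + b);
    }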
*/ if (icrl & APIC_ICR_BUSY) @@ -519,8 +507,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) */ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); break; - case AVIC_IPI_FAILURE_INVALID_TARGET: - break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; @@ -541,33 +527,33 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - int index; u32 *logical_apic_id_table; - int dlid = GET_APIC_LOGICAL_ID(ldr); - - if (!dlid) - return NULL; + u32 cluster, index; - if (flat) { /* flat */ - index = ffs(dlid) - 1; - if (index > 7) - return NULL; - } else { /* cluster */ - int cluster = (dlid & 0xf0) >> 4; - int apic = ffs(dlid & 0x0f) - 1; + ldr = GET_APIC_LOGICAL_ID(ldr); - if ((apic < 0) || (apic > 7) || - (cluster >= 0xf)) + if (flat) { + cluster = 0; + } else { + cluster = (ldr >> 4); + if (cluster >= 0xf) return NULL; - index = (cluster << 2) + apic; + ldr &= 0xf; } + if (!ldr || !is_power_of_2(ldr)) + return NULL; + + index = __ffs(ldr); + if (WARN_ON_ONCE(index > 7)) + return NULL; + index += (cluster << 2); logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) +static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -575,15 +561,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; entry = avic_get_logical_id_entry(vcpu, ldr, flat); if (!entry) - return -EINVAL; + return; new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); - - return 0; } static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) @@ -601,29 +585,23 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } -static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); /* AVIC does not support LDR update for x2APIC */ if (apic_x2apic_mode(vcpu->arch.apic)) - return 0; + return; if (ldr == svm->ldr_reg) - return 0; + return; avic_invalidate_logical_id_entry(vcpu); - if (ldr) - ret = avic_ldr_write(vcpu, id, ldr); - - if (!ret) - svm->ldr_reg = ldr; - - return ret; + svm->ldr_reg = ldr; + avic_ldr_write(vcpu, id, ldr); } static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) @@ -645,12 +623,14 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) switch (offset) { case APIC_LDR: - if (avic_handle_ldr_update(vcpu)) - return 0; + avic_handle_ldr_update(vcpu); break; case APIC_DFR: avic_handle_dfr_update(vcpu); break; + case APIC_RRR: + /* Ignore writes to Read Remote Data, it's read-only. 
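The rewritten avic_get_logical_id_entry() in this hunk reduces to a small amount of index arithmetic: reject cluster 0xf (broadcast), require exactly one destination bit, and compute (cluster << 2) + ffs(bit). An equivalent standalone re-derivation (uses the GCC/Clang __builtin_ctz; illustrative only):

    #include <stdint.h>

    /*
     * Return the logical-ID table index for an xAPIC LDR value, or -1 if
     * the LDR does not name exactly one destination.
     */
    static int logical_table_index(uint32_t ldr, int flat)
    {
        uint32_t cluster = 0;

        if (!flat) {
            cluster = ldr >> 4;
            if (cluster >= 0xf)     /* 0xf is broadcast, not a cluster */
                return -1;
            ldr &= 0xf;
        }

        /*
         * Exactly one bit must be set: zero or multiple targets cannot
         * be represented by a single table entry.
         */
        if (!ldr || (ldr & (ldr - 1)))
            return -1;

        return (int)((cluster << 2) + __builtin_ctz(ldr));
    }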
*/ + return 1; default: break; } @@ -739,18 +719,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) - return; - - if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { - WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); - return; - } - avic_refresh_apicv_exec_ctrl(vcpu); -} - static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -995,23 +963,6 @@ out: return ret; } -bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_NESTED) | - BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_SEV) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - - return supported & BIT(reason); -} - - static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { @@ -1064,6 +1015,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); @@ -1092,17 +1044,15 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } - -void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; - bool activated = kvm_vcpu_apicv_active(vcpu); - if (!enable_apicv) + if (!lapic_in_kernel(vcpu) || !enable_apicv) return; - if (activated) { + if (kvm_vcpu_apicv_active(vcpu)) { /** * During AVIC temporary deactivation, guest could update * APIC ID, DFR and LDR registers, which would not be trapped @@ -1116,6 +1066,16 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); +} + +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + bool activated = kvm_vcpu_apicv_active(vcpu); + + if (!enable_apicv) + return; + + avic_refresh_virtual_apic_mode(vcpu); if (activated) avic_vcpu_load(vcpu, vcpu->cpu); @@ -1160,37 +1120,37 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) * - Hypervisor can support both xAVIC and x2AVIC in the same guest. * - The mode can be switched at run-time. */ -bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +bool avic_hardware_setup(void) { if (!npt_enabled) return false; + /* AVIC is a prerequisite for x2AVIC. */ + if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { - avic_mode = AVIC_MODE_X1; pr_info("AVIC enabled\n"); } else if (force_avic) { /* * Some older systems does not advertise AVIC support. * See Revision Guide for specific AMD processor for more detail. */ - avic_mode = AVIC_MODE_X1; pr_warn("AVIC is not supported in CPUID but force enabled"); pr_warn("Your system might crash and burn"); } /* AVIC is a prerequisite for x2AVIC. 
*/ - if (boot_cpu_has(X86_FEATURE_X2AVIC)) { - if (avic_mode == AVIC_MODE_X1) { - avic_mode = AVIC_MODE_X2; - pr_info("x2AVIC enabled\n"); - } else { - pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); - pr_warn(FW_BUG "Try enable AVIC using force_avic option"); - } - } + x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); + if (x2avic_enabled) + pr_info("x2AVIC enabled\n"); - if (avic_mode != AVIC_MODE_NONE) - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - return !!avic_mode; + return true; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index add65dd59756..05d38944a6c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> @@ -1008,7 +1008,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; - vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; @@ -1104,7 +1103,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * to benefit from it right away. */ if (kvm_apicv_activated(vcpu->kvm)) - kvm_vcpu_update_apicv(vcpu); + __kvm_vcpu_update_apicv(vcpu); return 0; } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0e313fbae055..cc77a0681800 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -9,6 +9,8 @@ * * Implementation is based on pmu_intel.c file */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -229,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86d6897f4806..c25aeb550cd9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -6,6 +6,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
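The EVENTSEL_EVENT / MAX_NR_GP_COUNTERS constants added to the vendor pmu_ops in this series let common code clamp host-enumerated counters per vendor instead of hard-coding either limit. A sketch of the clamp, with invented struct names standing in for kvm_pmu_cap and kvm_pmu_ops:

    struct pmu_ops { int max_nr_gp_counters; };
    struct pmu_cap { int num_counters_gp; int num_counters_fixed; };

    #define MAX_FIXED 3   /* illustrative fixed-counter ceiling */

    /*
     * Clamp host-enumerated counters to what the vendor module supports,
     * mirroring the kvm_init_pmu_capability(pmu_ops) change in this series.
     */
    static void init_pmu_capability(struct pmu_cap *cap,
                                    const struct pmu_ops *ops)
    {
        if (cap->num_counters_gp > ops->max_nr_gp_counters)
            cap->num_counters_gp = ops->max_nr_gp_counters;
        if (cap->num_counters_fixed > MAX_FIXED)
            cap->num_counters_fixed = MAX_FIXED;
    }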
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> @@ -812,7 +813,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); + tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tpage) return -ENOMEM; @@ -1293,7 +1294,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; /* Pin guest memory */ @@ -1473,7 +1474,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 60c7c880266b..252e7f37e4e2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,4 +1,4 @@ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> @@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static int has_svm(void) +static bool kvm_is_svm_supported(void) { + int cpu = raw_smp_processor_id(); const char *msg; + u64 vm_cr; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; + pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + return false; } if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); - return 0; + return false; } - return 1; + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { + pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int svm_check_processor_compat(void) +{ + if (!kvm_is_svm_supported()) + return -EIO; + + return 0; } void __svm_write_tsc_multiplier(u64 multiplier) @@ -572,10 +588,6 @@ static int svm_hardware_enable(void) if (efer & EFER_SVME) return -EBUSY; - if (!has_svm()) { - pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); - return -EINVAL; - } sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; @@ -813,7 +825,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (intercept == svm->x2avic_msrs_intercepted) return; - if (avic_mode != AVIC_MODE_X2 || + if (!x2avic_enabled || !apic_x2apic_mode(svm->vcpu.arch.apic)) return; @@ -1326,6 +1338,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; + svm->nmi_masked = false; + svm->awaiting_iret_completion = false; + if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); } @@ -2076,7 +2091,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. 
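The sev_send_update_data()/sev_receive_update_data() change above is an overflow fix, not a style fix: when the length is a 32-bit quantity and the addition happens in 32 bits, a length near UINT32_MAX plus a small page offset can wrap and slip past the old single comparison. Checking the length on its own first makes the subsequent sum incapable of wrapping; a minimal demonstration:

    #include <stdint.h>

    #define PAGE_SIZE 4096u

    /*
     * Old check: (guest_len + offset > PAGE_SIZE) alone can be defeated by
     * 32-bit wraparound. Rejecting guest_len > PAGE_SIZE first bounds both
     * operands, so guest_len + offset can no longer overflow.
     */
    static int span_ok(uint32_t guest_len, uint32_t offset /* < PAGE_SIZE */)
    {
        if (guest_len > PAGE_SIZE)
            return 0;
        return guest_len + offset <= PAGE_SIZE;
    }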
*/ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("Guest triggered AMD Erratum 383\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -2470,7 +2485,7 @@ static int iret_interception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); ++vcpu->stat.nmi_window_exits; - vcpu->arch.hflags |= HF_IRET_MASK; + svm->awaiting_iret_completion = true; if (!sev_es_guest(vcpu->kvm)) { svm_clr_intercept(svm, INTERCEPT_IRET); svm->nmi_iret_rip = kvm_rip_read(vcpu); @@ -3003,8 +3018,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3033,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; @@ -3466,7 +3480,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) if (svm->nmi_l1_to_l2) return; - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; @@ -3571,7 +3585,6 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - bool ret; if (!gif_set(svm)) return true; @@ -3579,10 +3592,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vcpu->arch.hflags & HF_NMI_MASK); - - return ret; + return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + svm->nmi_masked; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3602,7 +3613,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { - return !!(vcpu->arch.hflags & HF_NMI_MASK); + return to_svm(vcpu)->nmi_masked; } static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3610,11 +3621,11 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) struct vcpu_svm *svm = to_svm(vcpu); if (masked) { - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); } else { - vcpu->arch.hflags &= ~HF_NMI_MASK; + svm->nmi_masked = false; if (!sev_es_guest(vcpu->kvm)) svm_clr_intercept(svm, INTERCEPT_IRET); } @@ -3700,7 +3711,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) + if (svm->nmi_masked && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3824,10 +3835,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. 
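The HF_NMI_MASK / HF_IRET_MASK flags threaded through this hunk become two per-vCPU booleans with a small state machine: injecting an NMI masks NMIs and intercepts IRET; the IRET intercept records RIP and waits; any later exit where RIP has moved means the IRET retired, so both flags clear. A compact sketch of those transitions (the field names mirror the new struct members; the rest is illustrative):

    #include <stdint.h>
    #include <stdbool.h>

    struct nmi_state {
        bool nmi_masked;              /* NMIs blocked until IRET retires   */
        bool awaiting_iret;           /* IRET intercepted, not yet retired */
        uint64_t iret_rip;            /* RIP when the IRET was intercepted */
    };

    static void on_nmi_injected(struct nmi_state *s)
    {
        s->nmi_masked = true;         /* and intercept IRET in the VMCB    */
    }

    static void on_iret_intercepted(struct nmi_state *s, uint64_t rip)
    {
        s->awaiting_iret = true;
        s->iret_rip = rip;            /* single-step until RIP moves       */
    }

    static void on_vmexit(struct nmi_state *s, uint64_t rip)
    {
        /* Progress past the recorded RIP means the IRET retired. */
        if (s->awaiting_iret && rip != s->iret_rip) {
            s->awaiting_iret = false;
            s->nmi_masked = false;
        }
    }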
*/ - if ((vcpu->arch.hflags & HF_IRET_MASK) && + if (svm->awaiting_iret_completion && (sev_es_guest(vcpu->kvm) || kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { - vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + svm->awaiting_iret_completion = false; + svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcb_mark_dirty(svm->vmcb, VMCB_CR); } -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - static void svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { @@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static int __init svm_check_processor_compat(void) -{ - return 0; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); /* * If the fault occurred in userspace, arbitrarily inject #GP @@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { - .name = "kvm_amd", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, @@ -4771,10 +4769,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = avic_set_virtual_apic_mode, + .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, + .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, @@ -4981,7 +4979,7 @@ static __init int svm_hardware_setup(void) } if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } @@ -4999,7 +4997,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5025,12 +5023,14 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); + enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; + } else if (!x2avic_enabled) { + svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } if (vls) { @@ -5089,10 +5089,7 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, - .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, .pmu_ops = &amd_pmu_ops, @@ -5100,15 +5097,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + if (!kvm_is_svm_supported()) + return -EOPNOTSUPP; + + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4826e6cc611b..839809972da1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -35,14 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; - -enum avic_modes { - AVIC_MODE_NONE = 0, - AVIC_MODE_X1, - AVIC_MODE_X2, -}; - -extern enum avic_modes avic_mode; +extern bool x2avic_enabled; /* * Clean bits in VMCB. @@ -237,8 +230,26 @@ struct vcpu_svm { struct svm_nested_state nested; + /* NMI mask value, used when vNMI is not enabled */ + bool nmi_masked; + + /* + * True when NMIs are still masked but guest IRET was just intercepted + * and KVM is waiting for RIP to change, which will signal that the + * intercepted IRET was retired and thus NMI can be unmasked. + */ + bool awaiting_iret_completion; + + /* + * Set when KVM is awaiting IRET completion and needs to inject NMIs as + * soon as the IRET completes (e.g. NMI is pending injection). KVM + * temporarily steals RFLAGS.TF to single-step the guest in this case + * in order to regain control as soon as the NMI-blocking condition + * goes away. 
+ */ bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; unsigned long soft_int_csbase; @@ -280,6 +291,9 @@ struct vcpu_svm { bool guest_state_loaded; bool x2avic_msrs_intercepted; + + /* Guest GIF value, used when vGIF is not enabled */ + bool guest_gif; }; struct svm_cpu_data { @@ -497,7 +511,7 @@ static inline void enable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl |= V_GIF_MASK; else - svm->vcpu.arch.hflags |= HF_GIF_MASK; + svm->guest_gif = true; } static inline void disable_gif(struct vcpu_svm *svm) @@ -507,7 +521,7 @@ static inline void disable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl &= ~V_GIF_MASK; else - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + svm->guest_gif = false; } static inline bool gif_set(struct vcpu_svm *svm) @@ -517,7 +531,7 @@ static inline bool gif_set(struct vcpu_svm *svm) if (vmcb) return !!(vmcb->control.int_ctl & V_GIF_MASK); else - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); + return svm->guest_gif; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -628,8 +642,23 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ - -bool avic_hardware_setup(struct kvm_x86_ops *ops); +#define AVIC_REQUIRED_APICV_INHIBITS \ +( \ + BIT(APICV_INHIBIT_REASON_DISABLE) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_NESTED) | \ + BIT(APICV_INHIBIT_REASON_IRQWIN) | \ + BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_SEV) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ +) + +bool avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -641,14 +670,13 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 26a89d0da93e..7af8422d3382 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V for SVM. 
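AVIC_REQUIRED_APICV_INHIBITS above replaces the removed avic_check_apicv_inhibit_reasons() callback: the per-vendor policy is now a compile-time bitmask that common code can test directly. A toy version with invented reason names:

    enum inhibit_reason { INHIBIT_DISABLE, INHIBIT_ABSENT, INHIBIT_HYPERV,
                          INHIBIT_NESTED, NR_INHIBIT_REASONS };

    #define BIT(n) (1ul << (n))

    /* A vendor-provided constant replaces a per-vendor callback. */
    static const unsigned long required_inhibits =
        BIT(INHIBIT_DISABLE) | BIT(INHIBIT_ABSENT) | BIT(INHIBIT_NESTED);

    static int inhibit_is_recognized(enum inhibit_reason reason)
    {
        return !!(required_inhibits & BIT(reason));
    }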
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 45faf84476ce..cff838f15db5 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -30,11 +30,11 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) hve->hv_enlightenments_control.msr_bitmap = 1; } -static inline void svm_hv_hardware_setup(void) +static inline __init void svm_hv_hardware_setup(void) { if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; svm_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; @@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void) if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { int cpu; - pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); for_each_online_cpu(cpu) { struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(cpu); @@ -84,7 +84,7 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { } -static inline void svm_hv_hardware_setup(void) +static inline __init void svm_hv_hardware_setup(void) { } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cd2ac9536c99..45162c1bcd8f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -66,13 +66,13 @@ struct vmcs_config { u64 misc; struct nested_vmx_msrs nested; }; -extern struct vmcs_config vmcs_config; +extern struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability { u32 ept; u32 vpid; }; -extern struct vmx_capability vmx_capability; +extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index ae03d1fe0355..22daca752797 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/smp.h> @@ -361,35 +362,43 @@ enum evmcs_revision { enum evmcs_ctrl_type { EVMCS_EXIT_CTRLS, EVMCS_ENTRY_CTRLS, + EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, + EVMCS_3RDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, NR_EVMCS_CTRLS, }; -static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { +static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_EXIT_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL, }, [EVMCS_ENTRY_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL, + }, + [EVMCS_EXEC_CTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL, }, [EVMCS_2NDEXEC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, + }, + [EVMCS_3RDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC, }, [EVMCS_PINCTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, [EVMCS_VMFUNC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC, }, }; -static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +static u32 evmcs_get_supported_ctls(enum 
evmcs_ctrl_type ctrl_type) { enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; - return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; + return evmcs_supported_ctrls[ctrl_type][evmcs_rev]; } static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) @@ -413,7 +422,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * { u32 ctl_low = (u32)*pdata; u32 ctl_high = (u32)(*pdata >> 32); - u32 unsupported_ctrls; + u32 supported_ctrls; /* * Hyper-V 2016 and 2019 try using these features even when eVMCS @@ -422,27 +431,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; + break; + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL); break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); + ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL); break; case MSR_IA32_VMX_VMFUNC: - ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); + ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC); break; } @@ -452,7 +465,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, u32 val) { - return !(val & evmcs_get_unsupported_ctls(ctrl_type)); + return !(val & ~evmcs_get_supported_ctls(ctrl_type)); } int nested_evmcs_check_controls(struct vmcs12 *vmcs12) @@ -461,6 +474,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) vmcs12->pin_based_vm_exec_control))) return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL, + vmcs12->cpu_based_vm_exec_control))) + return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, vmcs12->secondary_vm_exec_control))) return -EINVAL; @@ -488,6 +505,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } +#if IS_ENABLED(CONFIG_HYPERV) +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. 
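Flipping the eVMCS tables from "unsupported" to "supported" masks simplifies both call sites shown above: filtering a control MSR becomes masking its allowed-1 half with the supported set, and validating a control word becomes checking that no bit falls outside it. A sketch under the usual VMX convention that the low 32 bits of such an MSR are allowed-0 settings and the high 32 bits allowed-1 settings:

    #include <stdint.h>

    /* Trim the allowed-1 (high) half of a VMX control MSR to an allowlist. */
    static uint64_t filter_ctl_msr(uint64_t msr_val, uint32_t supported_high)
    {
        uint32_t lo = (uint32_t)msr_val;
        uint32_t hi = (uint32_t)(msr_val >> 32) & supported_high;

        return ((uint64_t)hi << 32) | lo;
    }

    /* With an allowlist, "valid" is simply "no bit outside the mask". */
    static int ctl_is_valid(uint32_t val, uint32_t supported)
    {
        return !(val & ~supported);
    }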
+ */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} +#endif + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 571e7929d14e..78d17667e7ec 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -48,22 +48,84 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); * Currently unsupported in KVM: * GUEST_IA32_RTIT_CTL = 0x00002814, */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + 
VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) struct evmcs_field { u16 offset; @@ -117,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, { int offset = evmcs_field_offset(field, clean_field); - WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", - field); - + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); return offset; } @@ -136,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write32(unsigned long field, u32 value) +static __always_inline void evmcs_write32(unsigned long field, u32 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -148,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline void evmcs_write16(unsigned long field, u16 value) +static __always_inline void evmcs_write16(unsigned long field, u16 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -160,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value) current_evmcs->hv_clean_fields &= ~clean_field; } -static inline u64 evmcs_read64(unsigned long field) +static __always_inline u64 evmcs_read64(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -170,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field) return *(u64 *)((char *)current_evmcs + offset); } -static inline u32 evmcs_read32(unsigned long field) +static __always_inline u32 evmcs_read32(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -180,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field) return *(u32 *)((char *)current_evmcs + offset); } -static inline u16 evmcs_read16(unsigned long field) +static __always_inline u16 evmcs_read16(unsigned long field) { int offset = get_evmcs_offset(field, NULL); @@ -190,16 +250,6 @@ static inline u16 evmcs_read16(unsigned long field) return *(u16 *)((char *)current_evmcs + offset); } -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - static inline void evmcs_load(u64 phys_addr) { struct hv_vp_assist_page *vp_ap = @@ -211,15 +261,15 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { 
return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } +static __always_inline void evmcs_write32(unsigned long field, u32 value) {} +static __always_inline void evmcs_write16(unsigned long field, u16 value) {} +static __always_inline u64 evmcs_read64(unsigned long field) { return 0; } +static __always_inline u32 evmcs_read32(unsigned long field) { return 0; } +static __always_inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ #define EVMPTR_INVALID (-1ULL) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d93c715cda6a..7c4f5ca405c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> @@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -5863,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) u32 function = kvm_rax_read(vcpu); /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. + * VMFUNC should never execute cleanly while L1 is active; KVM supports + * VMFUNC for nested VMs, but not for L1. */ - if (!is_guest_mode(vcpu)) { + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6880,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_TSC_SCALING | @@ -6912,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } - } - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; /* - * Advertise EPTP switching unconditionally - * since we emulate it + * Advertise EPTP switching irrespective of hardware support, + * KVM emulates it in software so long as VMFUNC is supported. 
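The eVMCS accessors touched above all follow one pattern: translate a VMCS field encoding into a byte offset within the in-memory structure (negative meaning unsupported), then read or write through a char * cast; marking them __always_inline keeps that translation out of instrumentation-sensitive paths. A simplified illustration with a three-field stand-in structure:

    #include <stdint.h>
    #include <stddef.h>

    struct evmcs_like { uint64_t rip; uint32_t exit_reason; uint16_t vpid; };

    /*
     * A field handle resolves to a byte offset into the structure; a
     * negative offset marks an unsupported field.
     */
    static inline int field_offset(unsigned long field)
    {
        switch (field) {
        case 0: return offsetof(struct evmcs_like, rip);
        case 1: return offsetof(struct evmcs_like, exit_reason);
        case 2: return offsetof(struct evmcs_like, vpid);
        default: return -1;
        }
    }

    static inline uint32_t read32(const struct evmcs_like *e, unsigned long f)
    {
        int off = field_offset(f);

        if (off < 0)
            return 0;   /* unsupported field reads back as zero */
        return *(const uint32_t *)((const char *)e + off);
    }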
*/ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (cpu_has_vmx_vmfunc()) + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e5cec07ca8d9..e8a3be0b9df9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -8,6 +8,8 @@ * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -20,16 +22,19 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) -static struct kvm_event_hw_type_mapping intel_arch_events[] = { - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +static struct { + u8 eventsel; + u8 unit_mask; +} const intel_arch_events[] = { + [0] = { 0x3c, 0x00 }, + [1] = { 0xc0, 0x00 }, + [2] = { 0x3c, 0x01 }, + [3] = { 0x2e, 0x4f }, + [4] = { 0x2e, 0x41 }, + [5] = { 0xc4, 0x00 }, + [6] = { 0xc5, 0x00 }, /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -762,8 +767,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) return; warn: - pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", - vcpu->vcpu_id); + pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id); } static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) @@ -810,4 +814,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..94c38bea60e7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <asm/irq_remapping.h> diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b12da2a6dec9..aa53c98034bf 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2021 Intel Corporation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/sgx.h> @@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.sgx_provisioning_allowed && (attributes & SGX_ATTR_PROVISIONKEY)) { if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) - pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n"); kvm_inject_gp(vcpu, 0); return 1; } @@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu) return handle_encls_ecreate(vcpu); if (leaf == EINIT) return handle_encls_einit(vcpu); - WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; return 0; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index ac290a44a693..7c1996b433e2 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -75,7 +75,7 @@ struct loaded_vmcs { struct vmcs_controls_shadow controls_shadow; }; -static inline bool is_intr_type(u32 intr_info, u32 type) +static __always_inline bool is_intr_type(u32 intr_info, u32 type) { const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK; @@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION); } -static inline bool is_nmi(u32 intr_info) +static __always_inline bool is_nmi(u32 intr_info) { return is_intr_type(intr_info, INTR_TYPE_NMI_INTR); } diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 2251b60920f8..106a72c923ca 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 766c6b3ef5ed..f550540ed54e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -31,6 +31,39 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif +.macro VMX_DO_EVENT_IRQOFF call_insn call_target + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + \call_insn \call_target + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + RET +.endm + .section .noinstr.text, "ax" /** @@ -69,8 +102,8 @@ SYM_FUNC_START(__vmx_vcpu_run) */ push %_ASM_ARG2 - /* Copy @flags to BL, _ASM_ARG3 is volatile. */ - mov %_ASM_ARG3B, %bl + /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3L, %ebx lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp @@ -106,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - testb $VMX_RUN_VMRESUME, %bl + test $VMX_RUN_VMRESUME, %ebx /* Load guest registers. Don't clobber flags. 
*/ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -128,7 +161,7 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'testb' above */ + /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ jz .Lvmlaunch /* @@ -266,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) +SYM_FUNC_START(vmx_do_nmi_irqoff) + VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(vmx_do_nmi_irqoff) + .section .text, "ax" @@ -320,35 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline) SYM_FUNC_END(vmread_error_trampoline) #endif -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) - /* - * Unconditionally create a stack frame, getting the correct RSP on the - * stack (for x86-64) would take two instructions anyways, and RBP can - * be used to restore RSP to make objtool happy (see below). - */ - push %_ASM_BP - mov %_ASM_SP, %_ASM_BP - -#ifdef CONFIG_X86_64 - /* - * Align RSP to a 16-byte boundary (to emulate CPU behavior) before - * creating the synthetic interrupt stack frame for the IRQ/NMI. - */ - and $-16, %rsp - push $__KERNEL_DS - push %rbp -#endif - pushf - push $__KERNEL_CS - CALL_NOSPEC _ASM_ARG1 - - /* - * "Restore" RSP from RBP, even though IRET has already unwound RSP to - * the correct value. objtool doesn't know the callee will IRET and, - * without the explicit restore, thinks the stack is getting walloped. - * Using an unwind hint is problematic due to x86-64's dynamic alignment. - */ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP - RET -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) +SYM_FUNC_START(vmx_do_interrupt_irqoff) + VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 +SYM_FUNC_END(vmx_do_interrupt_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7eec0226d56a..bcac3efcde41 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -12,6 +12,7 @@ * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/highmem.h> #include <linux/hrtimer.h> @@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault) if (fault) kvm_spurious_fault(); else - vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); + vmx_insn_failed("vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { - vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { - vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } @@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, 
loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -struct vmcs_config vmcs_config; -struct vmx_capability vmx_capability; +struct vmcs_config vmcs_config __ro_after_init; +struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) +static struct kvm_x86_ops vmx_x86_ops __initdata; + static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static __init void hv_init_evmcs(void) +{ + int cpu; + + if (!enlightened_vmcs) + return; + + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. + */ + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("Using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; + + } else { + enlightened_vmcs = false; + } +} + +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled but the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. 
when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_init_evmcs(void) {} +static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!instr_len) goto rip_updated; - WARN(exit_reason.enclave_mode, - "KVM: skipping instruction after SGX enclave VM-Exit"); + WARN_ONCE(exit_reason.enclave_mode, + "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; @@ -2138,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } @@ -2448,88 +2514,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !boot_cpu_has(X86_FEATURE_VMX); -} - -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - -static int vmx_hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. 
- */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - -static void vmx_hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - - if (cpu_vmxoff()) - kvm_spurious_fault(); - - intel_pt_handle_vmx(0); -} - /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping @@ -2565,8 +2549,7 @@ static bool cpu_has_perf_global_ctrl_bug(void) return false; } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -2584,7 +2567,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; @@ -2593,8 +2576,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, - struct vmx_capability *vmx_cap) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; @@ -2752,9 +2735,127 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) + evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif + + return 0; +} + +static bool kvm_is_vmx_supported(void) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_has_vmx()) { + pr_err("VMX not supported by CPU %d\n", cpu); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int vmx_check_processor_compat(void) +{ + int cpu = raw_smp_processor_id(); + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!kvm_is_vmx_supported()) + return -EIO; + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { + pr_err("Failed to setup VMCS config on CPU %d\n", cpu); + return -EIO; + } + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { + pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + return -EIO; + } + return 0; +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + return 0; } +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); @@ -2950,9 +3051,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) var.type = 0x3; var.avl = 0; if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); + pr_warn_once("segment base is not paragraph aligned " + "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); @@ -2982,8 +3082,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); + pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); vmx_segment_cache_clear(vmx); @@ -3800,39 +3899,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, ar); } -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct page *page; - void __user *hva; - int ret = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_memslot_enabled) - goto out; - hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } - - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); - kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; -} - int allocate_vpid(void) { int vpid; @@ -3865,8 +3931,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. 
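* (Illustrative summary of the eVMCS clean-field contract: L0 may skip re-reading any VMCS group whose clean bit is set, so rewriting the bitmap memory alone is not enough; clearing HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP forces L0 to rescan the bitmap on the next VM-entry, which is what the open-coded hunk below does.)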
*/ - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + if (evmcs->hv_enlightenments_control.msr_bitmap) + evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + } vmx->nested.force_msr_bitmap_recalc = true; } @@ -3947,29 +4018,20 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) -{ - unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - unsigned long read_intercept; - int msr; - - read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned int read_idx = msr / BITS_PER_LONG; - unsigned int write_idx = read_idx + (0x800 / sizeof(long)); - - msr_bitmap[read_idx] = read_intercept; - msr_bitmap[write_idx] = ~0ul; - } -} - static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + /* + * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves + * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, + * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. + */ + const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; + const int write_idx = read_idx + (0x800 / sizeof(u64)); struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; u8 mode; - if (!cpu_has_vmx_msr_bitmap()) + if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) return; if (cpu_has_secondary_exec_ctrls() && @@ -3987,7 +4049,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx->x2apic_msr_bitmap_mode = mode; - vmx_reset_x2apic_msrs(vcpu, mode); + /* + * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended + * registers (0x840 and above) intercepted, KVM doesn't support them. + * Intercept all writes by default and poke holes as needed. Pass + * through reads for all valid registers by default in x2APIC+APICv + * mode, only the current timer count needs on-demand emulation by KVM. + */ + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) + msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); + else + msr_bitmap[read_idx] = ~0ull; + msr_bitmap[write_idx] = ~0ull; /* * TPR reads and writes can be virtualized even if virtual interrupt @@ -4519,6 +4592,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* + * KVM doesn't support VMFUNC for L1, but the control is set in KVM's + * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. + */ + exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; @@ -4535,7 +4614,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. 
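* (Illustrative: the reworked check below reduces to roughly pml_on = enable_pml && atomic_read(&kvm->nr_memslots_dirty_logging) != 0, where pml_on is a hypothetical name for the resulting state of SECONDARY_EXEC_ENABLE_PML.)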
*/ - if (!vcpu->kvm->arch.cpu_dirty_logging_count) + if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -5099,8 +5178,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by + * vmx_vcpu_enter_exit(). + */ if (is_machine_check(intr_info) || is_nmi(intr_info)) - return 1; /* handled by handle_exception_nmi_irqoff() */ + return 1; /* * Queue the exception here instead of in handle_nm_fault_irqoff(). @@ -6790,17 +6874,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -void vmx_do_interrupt_nmi_irqoff(unsigned long entry); - -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, - unsigned long entry) -{ - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; - - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); - vmx_do_interrupt_nmi_irqoff(entry); - kvm_after_interrupt(vcpu); -} +void vmx_do_interrupt_irqoff(unsigned long entry); +void vmx_do_nmi_irqoff(void); static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { @@ -6822,9 +6897,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct vcpu_vmx *vmx) { - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6836,9 +6910,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) @@ -6848,10 +6919,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, - "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); + vmx_do_interrupt_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); + vcpu->arch.at_instruction_boundary = true; } @@ -6865,7 +6939,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_nmi_irqoff(vmx); + handle_exception_irqoff(vmx); } /* @@ -7100,9 +7174,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx, - unsigned long flags) + unsigned int flags) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -7126,6 +7201,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); + if 
(unlikely(vmx->fail)) + vmx->exit_reason.full = 0xdead; + else + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && + is_nmi(vmx_get_intr_info(vcpu))) { + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); + } + guest_state_exit_irqoff(); } @@ -7220,7 +7307,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); + vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { @@ -7267,12 +7354,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); @@ -7386,7 +7470,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { - err = alloc_apic_access_page(vcpu->kvm); + err = kvm_alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } @@ -7446,29 +7530,6 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static int __init vmx_check_processor_compat(void) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); - return -EIO; - } - - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - return -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - return -EIO; - } - return 0; -} - static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7940,17 +8001,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(!enable_pml)) + return; + if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* - * Note, cpu_dirty_logging_count can be changed concurrent with this + * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. 
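* (Illustrative: if the last dirty-logging memslot is removed right after the atomic_read() below, a follow-up KVM_REQ_UPDATE_CPU_DIRTY_LOGGING pass clears the stale PML bit before the vCPU re-enters the guest.)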
*/ - if (vcpu->kvm->arch.cpu_dirty_logging_count) + if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); @@ -8048,17 +8112,16 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_ABSENT) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - - return supported & BIT(reason); -} +#define VMX_REQUIRED_APICV_INHIBITS \ +( \ + BIT(APICV_INHIBIT_REASON_DISABLE)| \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ +) static void vmx_vm_destroy(struct kvm *kvm) { @@ -8068,7 +8131,9 @@ static void vmx_vm_destroy(struct kvm *kvm) } static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = "kvm_intel", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, .hardware_unsetup = vmx_hardware_unsetup, @@ -8142,7 +8207,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, - .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, @@ -8288,7 +8353,7 @@ static __init int hardware_setup(void) return -EIO; if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); if (boot_cpu_has(X86_FEATURE_NX)) @@ -8296,7 +8361,7 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) @@ -8315,7 +8380,7 @@ static __init int hardware_setup(void) /* NX support is required for shadow paging. 
*/ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } @@ -8467,9 +8532,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, @@ -8487,41 +8549,23 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void vmx_exit(void) +static void __vmx_exit(void) { + allow_smaller_maxphyaddr = false; + #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif + vmx_cleanup_l1d_flush(); +} +static void vmx_exit(void) +{ kvm_exit(); + kvm_x86_vendor_exit(); -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 0; - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif - vmx_cleanup_l1d_flush(); - - allow_smaller_maxphyaddr = false; + __vmx_exit(); } module_exit(vmx_exit); @@ -8529,56 +8573,29 @@ static int __init vmx_init(void) { int r, cpu; -#if IS_ENABLED(CONFIG_HYPERV) + if (!kvm_is_vmx_supported()) + return -EOPNOTSUPP; + /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. + * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing + * to unwind if a later step fails. */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { - - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } + hv_init_evmcs(); - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush - = hv_enable_l2_tlb_flush; - - } else { - enlightened_vmcs = false; - } -#endif - - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_x86_vendor_init(&vmx_init_ops); if (r) return r; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. 
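* (Illustrative expansion, with param_given as a hypothetical flag: the call below behaves like vmx_setup_l1d_flush(param_given ? vmentry_l1d_flush_param : VMENTER_L1D_FLUSH_AUTO), since an omitted parameter arrives as 'auto'.)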
*/ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + if (r) + goto err_l1d_flush; vmx_setup_fb_clear_ctrl(); @@ -8602,6 +8619,21 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), + THIS_MODULE); + if (r) + goto err_kvm_init; + return 0; + +err_kvm_init: + __vmx_exit(); +err_l1d_flush: + kvm_x86_vendor_exit(); + return r; } module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a3da84f4ea45..2acdc54bc34b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -640,12 +640,12 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); } -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } @@ -669,25 +669,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - } + return vmx->exit_qualification; } -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { - kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - } + return vmx->exit_intr_info; } diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 842dc898c972..db95bde52998 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -100,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) return value; do_fail: - WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); - pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + instrumentation_begin(); + WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field); + instrumentation_end(); return 0; do_exception: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0735fbc9ba8c..7713420abab0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -15,6 +15,7 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "irq.h" @@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void 
__get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -1421,7 +1423,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1438,6 +1440,10 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, @@ -1462,11 +1468,10 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -1484,7 +1489,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_X64_MSR_SYNDBG_OPTIONS, HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, @@ -2093,7 +2098,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); - pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); + pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } int kvm_emulate_mwait(struct kvm_vcpu *vcpu) @@ -2440,7 +2445,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); + pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n", + user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); @@ -3165,6 +3171,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; + kvm_xen_update_tsc_info(v); } vcpu->hv_clock.tsc_timestamp = tsc_timestamp; @@ -3562,9 +3569,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +static bool kvm_is_msr_to_save(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + return false; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -3610,15 +3628,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info) if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; @@ -3798,16 +3814,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3828,15 +3841,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -3884,20 +3896,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); + /* * Userspace is allowed to write '0' to MSRs that KVM reports * as to-be-saved, even if an MSR isn't fully supported. */ - return !msr_info->host_initiated || data; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); + if (msr_info->host_initiated && !data && + kvm_is_msr_to_save(msr)) + break; + return KVM_MSR_RET_INVALID; } return 0; @@ -3987,20 +3997,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (!msr_info->host_initiated) - return 1; - msr_info->data = 0; - break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... 
MSR_P6_PERFCTR1: @@ -4198,6 +4194,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); @@ -4255,6 +4252,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ + if (msr_info->host_initiated && + kvm_is_msr_to_save(msr_info->index)) { + msr_info->data = 0; + break; + } + return KVM_MSR_RET_INVALID; } return 0; @@ -4292,8 +4300,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, { struct kvm_msrs msrs; struct kvm_msr_entry *entries; - int r, n; unsigned size; + int r; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) @@ -4310,17 +4318,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, goto out; } - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + r = __msr_io(vcpu, &msrs, entries, do_msr); - r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; + r = -EFAULT; -out_free: kfree(entries); out: return r; @@ -4408,6 +4410,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -6482,7 +6485,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_x86_msr_filter *new_filter, *old_filter; bool default_allow; bool empty = true; - int r = 0; + int r; u32 i; if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) @@ -6508,17 +6511,14 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, } mutex_lock(&kvm->lock); - - /* The per-VM filter is protected by kvm->lock... */ - old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); - - rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); kvm_free_msr_filter(old_filter); kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); - mutex_unlock(&kvm->lock); return 0; } @@ -7018,83 +7018,98 @@ out: return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. 
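+ * (Illustrative, per the switch below: e.g. MSR_IA32_BNDCFGS is dropped when !kvm_mpx_supported(), and the Intel PT RTIT MSRs are dropped when X86_FEATURE_INTEL_PT is not supported.)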
+ */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... 
MSR_ARCH_PERFMON_PERFCTR_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { @@ -7721,7 +7736,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; emul_write: - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); + pr_warn_once("emulating exchange as write\n"); return emulator_write_emulated(ctxt, addr, new, bytes, exception); } @@ -8167,9 +8182,14 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } -static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) +{ + return is_smm(emul_to_vcpu(ctxt)); +} + +static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) { - return emul_to_vcpu(ctxt)->arch.hflags; + return is_guest_mode(emul_to_vcpu(ctxt)); } #ifndef CONFIG_KVM_SMM @@ -8238,7 +8258,8 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, - .get_hflags = emulator_get_hflags, + .is_smm = emulator_is_smm, + .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8282,7 +8303,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); if (!ctxt) { - pr_err("kvm: failed to allocate vcpu's emulator\n"); + pr_err("failed to allocate vcpu's emulator\n"); return NULL; } @@ -8310,8 +8331,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->exception.vector = -1; @@ -9293,35 +9312,66 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif -int kvm_arch_init(void *opaque) +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? 
: \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + +static int kvm_x86_check_processor_compatibility(void) +{ + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Compatibility checks are done when loading KVM and when enabling + * hardware, e.g. during CPU hotplug, to ensure all online CPUs are + * compatible, i.e. KVM should never perform a compatibility check on + * an offline CPU. + */ + WARN_ON(!cpu_online(cpu)); + + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) + return -EIO; + + return static_call(kvm_x86_check_processor_compatibility)(); +} + +static void kvm_x86_check_cpu_compat(void *ret) +{ + *(int *)ret = kvm_x86_check_processor_compatibility(); +} + +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { - struct kvm_x86_init_ops *ops = opaque; u64 host_pat; - int r; + int r, cpu; if (kvm_x86_ops.hardware_enable) { - pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); + pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } - if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support for '%s'\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", - ops->runtime_ops->name); - return -EOPNOTSUPP; - } - /* * KVM explicitly assumes that the guest has an FPU and * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the * vCPU's FPU state as a fxregs_state struct. */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { - printk(KERN_ERR "kvm: inadequate fpu\n"); + pr_err("inadequate fpu\n"); return -EOPNOTSUPP; } @@ -9339,19 +9389,19 @@ int kvm_arch_init(void *opaque) */ if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { - pr_err("kvm: host PAT[0] is not WB\n"); + pr_err("host PAT[0] is not WB\n"); return -EIO; } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { - pr_err("kvm: failed to allocate cache for x86 emulator\n"); + pr_err("failed to allocate cache for x86 emulator\n"); return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + pr_err("failed to allocate percpu kvm_user_return_msrs\n"); r = -ENOMEM; goto out_free_x86_emulator_cache; } @@ -9361,13 +9411,37 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_timer_init(); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(ops->pmu_ops); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + kvm_ops_update(ops); + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1); + if (r < 0) + goto out_unwind_ops; + } + + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. 
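+ * (E.g. kvm_timer_init() and kvm_register_perf_callbacks() below are undone only by kvm_x86_vendor_exit(), not by the out_* error paths above.)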
+ */ + kvm_timer_init(); + if (pi_inject_timer == -1) pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 @@ -9377,8 +9451,35 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_caps.has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; + } + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; + kvm_init_msr_list(); return 0; +out_unwind_ops: + kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); +out_mmu_exit: + kvm_mmu_vendor_module_exit(); out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: @@ -9386,8 +9487,22 @@ out_free_x86_emulator_cache: return r; } -void kvm_arch_exit(void) +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + int r; + + mutex_lock(&vendor_module_lock); + r = __kvm_x86_vendor_init(ops); + mutex_unlock(&vendor_module_lock); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); + +void kvm_x86_vendor_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); @@ -9404,7 +9519,7 @@ void kvm_arch_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - kvm_x86_ops.hardware_enable = NULL; + static_call(kvm_x86_hardware_unsetup)(); kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9412,7 +9527,11 @@ void kvm_arch_exit(void) static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.hardware_enable = NULL; + mutex_unlock(&vendor_module_lock); } +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { @@ -10065,7 +10184,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; bool activate; @@ -10100,7 +10219,30 @@ out: preempt_enable(); up_read(&vcpu->kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); +EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv); + +static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return; + + /* + * Due to sharing page tables across vCPUs, the xAPIC memslot must be + * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but + * hardware doesn't support x2APIC virtualization. E.g. some AMD + * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in + * this case so that KVM can use the AVIC doorbell to inject interrupts to + * running vCPUs, but KVM must not create SPTEs for the APIC base as + * the vCPU would incorrectly be able to access the vAPIC page via MMIO + * despite being in x2APIC mode. 
For simplicity, inhibiting the APIC + * access page is sticky. + */ + if (apic_x2apic_mode(vcpu->arch.apic) && + kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization) + kvm_inhibit_apic_access_page(vcpu); + + __kvm_vcpu_update_apicv(vcpu); +} void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set) @@ -10109,7 +10251,7 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason)) + if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason))) return; old = new = kvm->arch.apicv_inhibit_reasons; @@ -11553,7 +11695,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && kvm->created_vcpus) - pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); if (!kvm->arch.max_vcpu_ids) @@ -11630,7 +11772,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_wbinvd_dirty_mask; if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); + pr_err("failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } @@ -11904,6 +12046,11 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); + + ret = kvm_x86_check_processor_compatibility(); + if (ret) + return ret; + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; @@ -11990,88 +12137,6 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) -{ - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include <asm/kvm-x86-ops.h> -#undef __KVM_X86_OP - - kvm_pmu_ops_update(ops->pmu_ops); -} - -int kvm_arch_hardware_setup(void *opaque) -{ - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - kvm_init_pmu_capability(); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - kvm_ops_update(ops); - - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - kvm_caps.supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_caps.has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. 
- */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); - kvm_caps.max_guest_tsc_khz = max; - } - kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; - kvm_init_msr_list(); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - kvm_unregister_perf_callbacks(); - - static_call(kvm_x86_hardware_unsetup)(); -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - struct kvm_x86_init_ops *ops = opaque; - - WARN_ON(!irqs_disabled()); - - if (__cr4_reserved_bits(cpu_has, c) != - __cr4_reserved_bits(cpu_has, &boot_cpu_data)) - return -EIO; - - return ops->check_processor_compatibility(); -} - bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; @@ -12244,7 +12309,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, */ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)hva)) + if (IS_ERR_VALUE(hva)) return (void __user *)hva; } else { if (!slot || !slot->npages) @@ -12459,16 +12524,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { - struct kvm_arch *ka = &kvm->arch; + int nr_slots; if (!kvm_x86_ops.cpu_dirty_log_size) return; - if ((enable && ++ka->cpu_dirty_logging_count == 1) || - (!enable && --ka->cpu_dirty_logging_count == 0)) + nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); + if ((enable && nr_slots == 1) || !nr_slots) kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); - - WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..a8167b47b8c8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -331,6 +331,18 @@ extern bool report_ignored_msrs; extern bool eager_page_split; +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} + +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, @@ -382,13 +394,13 @@ enum kvm_intr_type { KVM_HANDLING_NMI, }; -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, - enum kvm_intr_type intr) +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 8fd41f5deae3..40edf4d1974c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -5,6 +5,7 @@ * * KVM Xen emulation */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "xen.h" @@ -22,6 +23,9 @@ #include <xen/interface/event_channel.h> #include <xen/interface/sched.h> +#include <asm/xen/cpuid.h> + +#include "cpuid.h" #include "trace.h" static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); @@ -2076,6 +2080,29 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) 
del_timer_sync(&vcpu->arch.xen.poll_timer); } +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + u32 function; + + if (!vcpu->arch.xen.cpuid.base) + return; + + function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); + if (function > vcpu->arch.xen.cpuid.limit) + return; + + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); + if (entry) { + entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; + entry->edx = vcpu->arch.hv_clock.tsc_shift; + } + + entry = kvm_find_cpuid_entry_index(vcpu, function, 2); + if (entry) + entry->eax = vcpu->arch.hw_tsc_khz; +} + void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index ea33d80a0c51..f8f1fe22d090 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,8 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include <asm/xen/hypervisor.h> + #ifdef CONFIG_KVM_XEN #include <linux/jump_label_ratelimit.h> @@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { @@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) { return false; } + +static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index bddfc9a46645..90e820ac9771 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 PHONY += posttest -ifeq ($(KBUILD_VERBOSE),1) +ifneq ($(findstring 1, $(KBUILD_VERBOSE)),) posttest_verbose = -v else posttest_verbose = diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 3d5cd2e57820..ee89f6bb9242 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -48,4 +48,4 @@ include/generated/user_constants.h: $(obj)/user-offsets.s FORCE UNPROFILE_OBJS := stub_segv.o CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile index 253bfb8cb702..ae169125d03f 100644 --- a/arch/x86/um/os-Linux/Makefile +++ b/arch/x86/um/os-Linux/Makefile @@ -10,4 +10,4 @@ obj-$(CONFIG_64BIT) += prctl.o USER_OBJS := $(obj-y) -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 6fbe97c52c99..6825e146a62f 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -61,7 +61,7 @@ CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage # quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ - $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + $(CC_FLAGS_LTO) $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c index 2112b8d14668..ff0f3b4b6c45 100644 --- a/arch/x86/um/vdso/um_vdso.c +++ b/arch/x86/um/vdso/um_vdso.c @@ -17,8 +17,10 @@ int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : 
"memory"); + asm("syscall" + : "=a" (ret) + : "0" (__NR_clock_gettime), "D" (clock), "S" (ts) + : "rcx", "r11", "memory"); return ret; } @@ -29,8 +31,10 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday), "D" (tv), "S" (tz) + : "rcx", "r11", "memory"); return ret; } |