Diffstat (limited to 'arch/x86/mm')
35 files changed, 181 insertions, 82 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..7ba7f3d7f477 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,13 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o	:= n
+# SPDX-License-Identifier: GPL-2.0
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o		:= n
+KCOV_INSTRUMENT_mem_encrypt.o	:= n
+
+KASAN_SANITIZE_mem_encrypt.o	:= n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o	= -pg
+endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b2da3b..048c761d97b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * AMD NUMA support.
  * Discover the memory map and associated nodes.
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index c076f710de4c..c3521e2be396 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
 #include <linux/uaccess.h>
 #include <linux/sched/debug.h>
 
+#include <asm/fpu/internal.h>
 #include <asm/traps.h>
 #include <asm/kdebug.h>
 
@@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL_GPL(ex_handler_refcount);
 
+/*
+ * Handler for when we fail to restore a task's FPU state.  We should never get
+ * here because the FPU state of a task using the FPU (task->thread.fpu.state)
+ * should always be valid.  However, past bugs have allowed userspace to set
+ * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
+ * These caused XRSTOR to fail when switching to the task, leaking the FPU
+ * registers of the task previously executing on the CPU.  Mitigate this class
+ * of vulnerability by restoring from the initial state (essentially, zeroing
+ * out all the FPU registers) if we can't restore from the task's FPU state.
+ */
+bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+			  struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+
+	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
+		  (void *)instruction_pointer(regs));
+
+	__copy_kernel_to_fpregs(&init_fpstate, -1);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+
 bool ex_handler_ext(const struct exception_table_entry *fixup,
 		   struct pt_regs *regs, int trapnr)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..b0ff378650a9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Copyright (C) 1995  Linus Torvalds
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
@@ -192,8 +193,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *	     faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-		struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 {
 	/* This is effectively an #ifdef */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -209,7 +209,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
-	if (!vma) {
+	if (!pkey) {
 		WARN_ONCE(1, "PKU fault with no VMA passed in");
 		info->si_pkey = 0;
 		return;
@@ -219,13 +219,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * absolutely guranteed to be 100% accurate because of
 	 * the race explained above.
 	 */
-	info->si_pkey = vma_pkey(vma);
+	info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk, struct vm_area_struct *vma,
-		     int fault)
+		     struct task_struct *tsk, u32 *pkey, int fault)
 {
 	unsigned lsb = 0;
 	siginfo_t info;
@@ -240,7 +239,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 		lsb = PAGE_SHIFT;
 	info.si_addr_lsb = lsb;
 
-	fill_sig_info_pkey(si_code, &info, vma);
+	fill_sig_info_pkey(si_code, &info, pkey);
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -762,8 +761,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 	unsigned long flags;
 	int sig;
-	/* No context means no VMA to pass down */
-	struct vm_area_struct *vma = NULL;
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -788,7 +785,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_info_fault(signal, si_code, address,
-					     tsk, vma, 0);
+					     tsk, NULL, 0);
 		}
 
 		/*
@@ -806,7 +803,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	if (is_vmalloc_addr((void *)address) &&
 	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
 	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
-		register void *__sp asm("rsp");
 		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
 		/*
 		 * We're likely to be running with very little stack space
@@ -821,7 +817,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		asm volatile ("movq %[stack], %%rsp\n\t"
 			      "call handle_stack_overflow\n\t"
 			      "1: jmp 1b"
-			      : "+r" (__sp)
+			      : ASM_CALL_CONSTRAINT
 			      : "D" ("kernel stack overflow (page fault)"),
 				"S" (regs), "d" (address),
 				[stack] "rm" (stack));
@@ -897,8 +893,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		       unsigned long address, struct vm_area_struct *vma,
-		       int si_code)
+		       unsigned long address, u32 *pkey, int si_code)
 {
 	struct task_struct *tsk = current;
 
@@ -946,7 +941,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code	= error_code;
 		tsk->thread.trap_nr	= X86_TRAP_PF;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
 		return;
 	}
@@ -959,9 +954,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		     unsigned long address, struct vm_area_struct *vma)
+		     unsigned long address, u32 *pkey)
 {
-	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 }
 
 static void
@@ -969,6 +964,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	   unsigned long address,  struct vm_area_struct *vma, int si_code)
 {
 	struct mm_struct *mm = current->mm;
+	u32 pkey;
+
+	if (vma)
+		pkey = vma_pkey(vma);
 
 	/*
 	 * Something tried to access memory that isn't in our memory map..
@@ -976,7 +975,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	 */
 	up_read(&mm->mmap_sem);
 
-	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
+	__bad_area_nosemaphore(regs, error_code, address,
+			       (vma) ? &pkey : NULL, si_code);
 }
 
 static noinline void
@@ -1019,7 +1019,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
-	  struct vm_area_struct *vma, unsigned int fault)
+	  u32 *pkey, unsigned int fault)
 {
 	struct task_struct *tsk = current;
 	int code = BUS_ADRERR;
@@ -1046,13 +1046,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
+	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
 }
 
 static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-	       unsigned long address, struct vm_area_struct *vma,
-	       unsigned int fault)
+	       unsigned long address, u32 *pkey, unsigned int fault)
 {
 	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
@@ -1076,9 +1075,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 			     VM_FAULT_HWPOISON_LARGE))
-			do_sigbus(regs, error_code, address, vma, fault);
+			do_sigbus(regs, error_code, address, pkey, fault);
 		else if (fault & VM_FAULT_SIGSEGV)
-			bad_area_nosemaphore(regs, error_code, address, vma);
+			bad_area_nosemaphore(regs, error_code, address, pkey);
 		else
 			BUG();
 	}
@@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	int fault, major = 0;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1441,7 +1441,17 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
 	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+	 *
+	 * Note that handle_userfault() may also release and reacquire mmap_sem
+	 * (and not return with VM_FAULT_RETRY), when returning to userland to
+	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
+	 * (potentially after handling any pending signal during the return to
+	 * userland). The return to userland is identified whenever
+	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+	 * Thus we have to be careful about not touching vma after handling the
+	 * fault, so we read the pkey beforehand.
 	 */
+	pkey = vma_pkey(vma);
 	fault = handle_mm_fault(vma, address, flags);
 	major |= fault & VM_FAULT_MAJOR;
 
@@ -1470,7 +1480,7 @@ good_area:
 
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, vma, fault);
+		mm_fault_error(regs, error_code, address, &pkey, fault);
 		return;
 	}
 
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6d06cf33e3de..8ae0000cbdb3 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * IA-32 Huge TLB Page Support for Kernel.
  *
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 31cea988fa36..ab33a32df2a8 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Helper routines for building identity mapping page tables. This is
  * included by both the compressed kernel and the regular kernel.
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index bc84b73684b7..8f5be3eb40dd 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #define DISABLE_BRANCH_PROFILING
 #define pr_fmt(fmt) "kasan: " fmt
 #include <linux/bootmem.h>
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index af599167fe3c..879ef930e2c2 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file implements KASLR memory randomization for x86_64. It randomizes
  * the virtual address space of kernel memory regions (physical memory
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index dab41876cdd5..872ec4159a68 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/interrupt.h>
 #include <linux/kdebug.h>
 #include <linux/kmemcheck.h>
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 0efc2e8d0a20..39f80d7a874d 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
 #define ARCH__X86__MM__KMEMCHECK__ERROR_H
 
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 324aa3f07237..df8109ddf7fe 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
 
 #include "opcode.h"
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 6956aad66b5b..51a1ce94c24a 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
 #define ARCH__X86__MM__KMEMCHECK__OPCODE_H
 
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 4ead26eeaf96..8a03be90272a 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 
 #include <asm/pgtable.h>
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index 9f5966456492..b595612382c2 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
 #define ARCH__X86__MM__KMEMCHECK__PTE_H
 
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index aef7140c0063..7ce0be1f99eb 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/bug.h>
 #include <linux/kernel.h>
 
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8fed4fe11f95..8d759aae453d 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
 #define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
 
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index ff0b2f70fbcb..49768dc18664 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
 #define ARCH__X86__MM__KMEMCHECK__SHADOW_H
 
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index afc47f5c9531..c21c2ed04612 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Support for MMIO probes.
  * Benfit many code from kprobes
  * (C) 2002 Louis Zhuang <[email protected]>.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 3fcc8e01683b..16c5f37933a2 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -10,6 +10,8 @@
  * published by the Free Software Foundation.
  */
 
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <linux/mm.h>
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 62474ba66c8e..4e1f6e1b8159 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __X86_MM_INTERNAL_H
 #define __X86_MM_INTERNAL_H
 
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 9ceaa955d2ba..7eb06701a935 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * mpx.c - Memory Protection eXtensions
  *
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9405ffc91502..066f3511d5f1 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Generic VM initialization for x86-64 NUMA setups.
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index d805162e6045..34a2a3bfde9c 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * NUMA emulation
  */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ad86ec91e640..86860f279662 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __X86_MM_NUMA_INTERNAL_H
 #define __X86_MM_NUMA_INTERNAL_H
 
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 5f169d5d76a8..a25588ad75ef 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * self test for change_page_attr.
  *
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index a739bfc40690..eeb5caeb089b 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __PAT_INTERNAL_H_
 #define __PAT_INTERNAL_H_
 
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index d76485b22824..fa16036fa592 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Handle caching attributes in page tables (PAT)
  *
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b372f3442bbf..17ebc5a978cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <asm/pgalloc.h>
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b9bd5b8b14fa..6b9bf023a700 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index cfc3b9121ce4..7f9acb68324c 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/bootmem.h>
 #include <linux/mmdebug.h>
 #include <linux/export.h>
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
index a3cd5a0c97b3..9f6419cafc32 100644
--- a/arch/x86/mm/physaddr.h
+++ b/arch/x86/mm/physaddr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #include <asm/processor.h>
 
 static inline int phys_addr_valid(resource_size_t addr)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 2dab69a706ec..d7bc0eea20a5 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -18,7 +18,6 @@
 
 #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
 #include <asm/mmu_context.h>            /* vma_pkey()                   */
-#include <asm/fpu/internal.h>           /* fpregs_active()              */
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
@@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm)
 	 */
 	preempt_disable();
 	if (!need_to_set_mm_pkey &&
-	    fpregs_active() &&
+	    current->thread.fpu.initialized &&
 	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
 		preempt_enable();
 		return execute_only_pkey;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f505b6..adb3c5784dac 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/init.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 3ea20d61b523..dac07e4f5834 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * ACPI 3.0 based NUMA setup
  * Copyright 2004 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1ab3821f9e26..3118392cdf75 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,7 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 			    u16 *new_asid, bool *need_flush)
 {
@@ -80,10 +81,11 @@ void leave_mm(int cpu)
 		return;
 
 	/* Warn if we're not lazy. */
-	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
 	switch_mm(NULL, &init_mm, NULL);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -126,8 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() !=
-			 (__sme_pa(real_prev->pgd) | prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -143,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		__flush_tlb_all();
 	}
 #endif
+	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
-			  next->context.ctx_id);
-
-		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-			/*
-			 * There's nothing to do: we weren't lazy, and we
-			 * aren't changing our mm.  We don't need to flush
-			 * anything, nor do we need to update CR3, CR4, or
-			 * LDTR.
-			 */
-			return;
-		}
-
-		/* Resume remote flushes and then read tlb_gen. */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-		    next_tlb_gen) {
-			/*
-			 * Ideally, we'd have a flush_tlb() variant that
-			 * takes the known CR3 value as input.  This would
-			 * be faster on Xen PV and on hypothetical CPUs
-			 * on which INVPCID is fast.
-			 */
-			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-				       next_tlb_gen);
-			write_cr3(__sme_pa(next->pgd) | prev_asid);
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
-		}
+		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+			   next->context.ctx_id);
 
 		/*
-		 * We just exited lazy mode, which means that CR4 and/or LDTR
-		 * may be stale.  (Changes to the required CR4 and LDTR states
-		 * are not reflected in tlb_gen.)
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
+		if (WARN_ON_ONCE(real_prev != &init_mm &&
+				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+
+		return;
 	} else {
 		u16 new_asid;
 		bool need_flush;
@@ -192,7 +172,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 * mapped in the new pgd, we'll double-fault.  Forcibly
 			 * map it.
 			 */
-			unsigned int index = pgd_index(current_stack_pointer());
+			unsigned int index = pgd_index(current_stack_pointer);
 			pgd_t *pgd = next->pgd + index;
 
 			if (unlikely(pgd_none(*pgd)))
@@ -200,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		}
 
 		/* Stop remote flushes for the previous mm */
-		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+				real_prev != &init_mm);
+		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
@@ -216,13 +195,23 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(__sme_pa(next->pgd) | new_asid);
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
+			write_cr3(build_cr3(next, new_asid));
+
+			/*
+			 * NB: This gets called via leave_mm() in the idle path
+			 * where RCU functions differently.  Tracing normally
+			 * uses RCU, so we need to use the _rcuidle variant.
+			 *
+			 * (There is no good reason for this.  The idle code should
+			 *  be rearranged to call this before rcu_idle_enter().)
+			 */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+			write_cr3(build_cr3_noflush(next, new_asid));
+
+			/* See above wrt _rcuidle. */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 		}
 
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
@@ -234,6 +223,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 }
 
 /*
+ * Please ignore the name of this function.  It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	if (tlb_defer_switch_to_init_mm()) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
+}
+
+/*
  * Call this when reinitializing a CPU.  It fixes the following potential
  * problems:
  *
@@ -265,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(cr3 & ~CR3_PCID_MASK);
+	write_cr3(build_cr3(mm, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -304,16 +327,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	if (unlikely(loaded_mm == &init_mm))
+		return;
+
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
-	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
 		/*
-		 * We're in lazy mode -- don't flush.  We can get here on
-		 * remote flushes due to races and on local flushes if a
-		 * kernel thread coincidentally flushes the mm it's lazily
-		 * still using.
+		 * We're in lazy mode.  We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB.  Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
 		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
 	}
 