diff options
Diffstat (limited to 'arch/x86/mm')
| -rw-r--r-- | arch/x86/mm/amdtopology.c | 22 | ||||
| -rw-r--r-- | arch/x86/mm/extable.c | 9 | ||||
| -rw-r--r-- | arch/x86/mm/fault.c | 43 | ||||
| -rw-r--r-- | arch/x86/mm/gup.c | 2 | ||||
| -rw-r--r-- | arch/x86/mm/init.c | 4 | ||||
| -rw-r--r-- | arch/x86/mm/kaslr.c | 26 | ||||
| -rw-r--r-- | arch/x86/mm/mpx.c | 5 | ||||
| -rw-r--r-- | arch/x86/mm/numa.c | 27 | ||||
| -rw-r--r-- | arch/x86/mm/pageattr.c | 21 | ||||
| -rw-r--r-- | arch/x86/mm/pat.c | 14 | ||||
| -rw-r--r-- | arch/x86/mm/pat_rbtree.c | 4 | ||||
| -rw-r--r-- | arch/x86/mm/pkeys.c | 142 | ||||
| -rw-r--r-- | arch/x86/mm/tlb.c | 15 | 
13 files changed, 272 insertions, 62 deletions
| diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index ba47524f56e8..d1c7de095808 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -52,21 +52,6 @@ static __init int find_northbridge(void)  	return -ENOENT;  } -static __init void early_get_boot_cpu_id(void) -{ -	/* -	 * need to get the APIC ID of the BSP so can use that to -	 * create apicid_to_node in amd_scan_nodes() -	 */ -#ifdef CONFIG_X86_MPPARSE -	/* -	 * get boot-time SMP configuration: -	 */ -	if (smp_found_config) -		early_get_smp_config(); -#endif -} -  int __init amd_numa_init(void)  {  	u64 start = PFN_PHYS(0); @@ -180,8 +165,11 @@ int __init amd_numa_init(void)  	cores = 1 << bits;  	apicid_base = 0; -	/* get the APIC ID of the BSP early for systems with apicid lifting */ -	early_get_boot_cpu_id(); +	/* +	 * get boot-time SMP configuration: +	 */ +	early_get_smp_config(); +  	if (boot_cpu_physical_apicid > 0) {  		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);  		apicid_base = boot_cpu_physical_apicid; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 832b98f822be..fcd06f7526de 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/extable.h>  #include <asm/uaccess.h>  #include <asm/traps.h>  #include <asm/kdebug.h> @@ -135,7 +135,12 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)  	if (early_recursion_flag > 2)  		goto halt_loop; -	if (regs->cs != __KERNEL_CS) +	/* +	 * Old CPUs leave the high bits of CS on the stack +	 * undefined.  I'm not sure which CPUs do this, but at least +	 * the 486 DX works this way. +	 */ +	if ((regs->cs & 0xFFFF) != __KERNEL_CS)  		goto fail;  	/* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..9f72ca3b2669 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -5,7 +5,7 @@   */  #include <linux/sched.h>		/* test_thread_flag(), ...	
*/  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/ -#include <linux/module.h>		/* search_exception_table	*/ +#include <linux/extable.h>		/* search_exception_tables	*/  #include <linux/bootmem.h>		/* max_low_pfn			*/  #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/ @@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,  		return;  	} +#ifdef CONFIG_VMAP_STACK +	/* +	 * Stack overflow?  During boot, we can fault near the initial +	 * stack in the direct map, but that's not an overflow -- check +	 * that we're in vmalloc space to avoid this. +	 */ +	if (is_vmalloc_addr((void *)address) && +	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || +	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { +		register void *__sp asm("rsp"); +		unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); +		/* +		 * We're likely to be running with very little stack space +		 * left.  It's plausible that we'd hit this condition but +		 * double-fault even before we get this far, in which case +		 * we're fine: the double-fault handler will deal with it. +		 * +		 * We don't want to make it all the way into the oops code +		 * and then double-fault, though, because we're likely to +		 * break the console driver and lose most of the stack dump. +		 */ +		asm volatile ("movq %[stack], %%rsp\n\t" +			      "call handle_stack_overflow\n\t" +			      "1: jmp 1b" +			      : "+r" (__sp) +			      : "D" ("kernel stack overflow (page fault)"), +				"S" (regs), "d" (address), +				[stack] "rm" (stack)); +		unreachable(); +	} +#endif +  	/*  	 * 32-bit:  	 * @@ -1112,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)  {  	/* This is only called for the current mm, so: */  	bool foreign = false; + +	/* +	 * Read or write was blocked by protection keys.  
This is +	 * always an unconditional error and can never result in +	 * a follow-up action to resolve the fault, like a COW. +	 */ +	if (error_code & PF_PK) +		return 1; +  	/*  	 * Make sure to check the VMA so that we do not perform  	 * faults just to hit a PF_PK as soon as we fill in a diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index b8b6a60b32cf..0d4fb3ebbbac 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -435,7 +435,7 @@ slow_irqon:  		ret = get_user_pages_unlocked(start,  					      (end - start) >> PAGE_SHIFT, -					      write, 0, pages); +					      pages, write ? FOLL_WRITE : 0);  		/* Have to be a bit careful with return values */  		if (nr > 0) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d28a2d741f9e..22af912d66d2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)  	}  } -void free_initmem(void) +void __ref free_initmem(void)  { +	e820_reallocate_tables(); +  	free_init_pages("unused kernel",  			(unsigned long)(&__init_begin),  			(unsigned long)(&__init_end)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index bda8d5eef04d..887e57182716 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -40,17 +40,26 @@   * You need to add an if/def entry if you introduce a new memory region   * compatible with KASLR. Your entry must be in logical order with memory   * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to   * ensure that this order is correct and won't be changed.   
*/  static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMEMMAP_START; + +#if defined(CONFIG_X86_ESPFIX64) +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +#elif defined(CONFIG_EFI) +static const unsigned long vaddr_end = EFI_VA_START; +#else +static const unsigned long vaddr_end = __START_KERNEL_map; +#endif  /* Default values */  unsigned long page_offset_base = __PAGE_OFFSET_BASE;  EXPORT_SYMBOL(page_offset_base);  unsigned long vmalloc_base = __VMALLOC_BASE;  EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base);  /*   * Memory regions randomized by KASLR (except modules that use a separate logic @@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {  } kaslr_regions[] = {  	{ &page_offset_base, 64/* Maximum */ },  	{ &vmalloc_base, VMALLOC_SIZE_TB }, +	{ &vmemmap_base, 1 },  };  /* Get size in bytes used by the memory region */ @@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)  	struct rnd_state rand_state;  	unsigned long remain_entropy; +	/* +	 * All these BUILD_BUG_ON checks ensure the memory layout is +	 * consistent with the vaddr_start/vaddr_end variables. 
+	 */ +	BUILD_BUG_ON(vaddr_start >= vaddr_end); +	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && +		     vaddr_end >= EFI_VA_START); +	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || +		      IS_ENABLED(CONFIG_EFI)) && +		     vaddr_end >= __START_KERNEL_map); +	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); +  	if (!kaslr_memory_enabled())  		return; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 80476878eb4c..e4f800999b32 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -544,10 +544,9 @@ static int mpx_resolve_fault(long __user *addr, int write)  {  	long gup_ret;  	int nr_pages = 1; -	int force = 0; -	gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, -			force, NULL, NULL); +	gup_ret = get_user_pages((unsigned long)addr, nr_pages, +			write ? FOLL_WRITE : 0,	NULL, NULL);  	/*  	 * get_user_pages() returns number of pages gotten.  	 * 0 means we failed to fault in and get anything, diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fb682108f4dc..3f35b48d1d9d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,22 +722,19 @@ void __init x86_numa_init(void)  	numa_init(dummy_numa_init);  } -static __init int find_near_online_node(int node) +static void __init init_memory_less_node(int nid)  { -	int n, val; -	int min_val = INT_MAX; -	int best_node = -1; +	unsigned long zones_size[MAX_NR_ZONES] = {0}; +	unsigned long zholes_size[MAX_NR_ZONES] = {0}; -	for_each_online_node(n) { -		val = node_distance(node, n); +	/* Allocate and initialize node data. Memory-less node is now online.*/ +	alloc_node_data(nid); +	free_area_init_node(nid, zones_size, 0, zholes_size); -		if (val < min_val) { -			min_val = val; -			best_node = n; -		} -	} - -	return best_node; +	/* +	 * All zonelists will be built later in start_kernel() after per cpu +	 * areas are initialized. 
+	 */  }  /* @@ -766,8 +763,10 @@ void __init init_cpu_to_node(void)  		if (node == NUMA_NO_NODE)  			continue; +  		if (!node_online(node)) -			node = find_near_online_node(node); +			init_memory_less_node(node); +  		numa_set_node(cpu, node);  	}  } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09fa4f0..e3353c97d086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa,  	}  } -static int populate_pmd(struct cpa_data *cpa, -			unsigned long start, unsigned long end, -			unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, +			 unsigned long start, unsigned long end, +			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)  { -	unsigned int cur_pages = 0; +	long cur_pages = 0;  	pmd_t *pmd;  	pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa,  	return num_pages;  } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, -			pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, +			 pgprot_t pgprot)  {  	pud_t *pud;  	unsigned long end; -	int cur_pages = 0; +	long cur_pages = 0;  	pgprot_t pud_pgprot;  	end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,  	/* Map trailing leftover */  	if (start < end) { -		int tmp; +		long tmp;  		pud = pud_offset(pgd, start);  		if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)  	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);  	pud_t *pud = NULL;	/* shut up gcc */  	pgd_t *pgd_entry; -	int ret; +	long ret;  	pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa)  static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)  { -	int ret, numpages = cpa->numpages; +	
unsigned long numpages = cpa->numpages; +	int ret;  	while (numpages) {  		/* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 170cc4ff057b..83e701f160a9 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -730,6 +730,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end)  	free_memtype(start, end);  } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ +	enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + +	return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ +	io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); +  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,  				unsigned long size, pgprot_t vma_prot)  { diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index de391b7bc19a..159b52ccd600 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)  struct memtype *rbt_memtype_lookup(u64 addr)  { -	struct memtype *data; -	data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); -	return data; +	return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);  }  #if defined(CONFIG_DEBUG_FS) diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e8c474451928..f88ce0e5efd9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -11,6 +11,7 @@   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   * more details.   */ +#include <linux/debugfs.h>		/* debugfs_create_u32()		*/  #include <linux/mm_types.h>             /* mm_struct, vma, etc...       
*/  #include <linux/pkeys.h>                /* PKEY_*                       */  #include <uapi/asm-generic/mman-common.h> @@ -21,8 +22,19 @@  int __execute_only_pkey(struct mm_struct *mm)  { +	bool need_to_set_mm_pkey = false; +	int execute_only_pkey = mm->context.execute_only_pkey;  	int ret; +	/* Do we need to assign a pkey for mm's execute-only maps? */ +	if (execute_only_pkey == -1) { +		/* Go allocate one to use, which might fail */ +		execute_only_pkey = mm_pkey_alloc(mm); +		if (execute_only_pkey < 0) +			return -1; +		need_to_set_mm_pkey = true; +	} +  	/*  	 * We do not want to go through the relatively costly  	 * dance to set PKRU if we do not need to.  Check it @@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)  	 * can make fpregs inactive.  	 */  	preempt_disable(); -	if (fpregs_active() && -	    !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { +	if (!need_to_set_mm_pkey && +	    fpregs_active() && +	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {  		preempt_enable(); -		return PKEY_DEDICATED_EXECUTE_ONLY; +		return execute_only_pkey;  	}  	preempt_enable(); -	ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + +	/* +	 * Set up PKRU so that it denies access for everything +	 * other than execution. +	 */ +	ret = arch_set_user_pkey_access(current, execute_only_pkey,  			PKEY_DISABLE_ACCESS);  	/*  	 * If the PKRU-set operation failed somehow, just return  	 * 0 and effectively disable execute-only support.  	 
*/ -	if (ret) -		return 0; +	if (ret) { +		mm_set_pkey_free(mm, execute_only_pkey); +		return -1; +	} -	return PKEY_DEDICATED_EXECUTE_ONLY; +	/* We got one, store it and use it from here on out */ +	if (need_to_set_mm_pkey) +		mm->context.execute_only_pkey = execute_only_pkey; +	return execute_only_pkey;  }  static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) @@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)  	/* Do this check first since the vm_flags should be hot */  	if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)  		return false; -	if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) +	if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)  		return false;  	return true; @@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey  	 */  	return vma_pkey(vma);  } + +#define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) + +/* + * Make the default PKRU value (at execve() time) as restrictive + * as possible.  This ensures that any threads clone()'d early + * in the process's lifetime will not accidentally get access + * to data which is pkey-protected later on. + */ +u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | +		      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | +		      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | +		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | +		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); + +/* + * Called from the FPU code when creating a fresh set of FPU + * registers.  This is called from a very specific context where + * we know the FPU registers are safe for use and we can use PKRU + * directly.  The fact that PKRU is only available when we are + * using eagerfpu mode makes this possible. 
+ */ +void copy_init_pkru_to_fpregs(void) +{ +	u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); +	/* +	 * Any write to PKRU takes it out of the XSAVE 'init +	 * state' which increases context switch cost.  Avoid +	 * writing 0 when PKRU was already 0. +	 */ +	if (!init_pkru_value_snapshot && !read_pkru()) +		return; +	/* +	 * Override the PKRU state that came from 'init_fpstate' +	 * with the baseline from the process. +	 */ +	write_pkru(init_pkru_value_snapshot); +} + +static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, +			     size_t count, loff_t *ppos) +{ +	char buf[32]; +	unsigned int len; + +	len = sprintf(buf, "0x%x\n", init_pkru_value); +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t init_pkru_write_file(struct file *file, +		 const char __user *user_buf, size_t count, loff_t *ppos) +{ +	char buf[32]; +	ssize_t len; +	u32 new_init_pkru; + +	len = min(count, sizeof(buf) - 1); +	if (copy_from_user(buf, user_buf, len)) +		return -EFAULT; + +	/* Make the buffer a valid string that we can not overrun */ +	buf[len] = '\0'; +	if (kstrtouint(buf, 0, &new_init_pkru)) +		return -EINVAL; + +	/* +	 * Don't allow insane settings that will blow the system +	 * up immediately if someone attempts to disable access +	 * or writes to pkey 0. 
+	 */ +	if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT)) +		return -EINVAL; + +	WRITE_ONCE(init_pkru_value, new_init_pkru); +	return count; +} + +static const struct file_operations fops_init_pkru = { +	.read = init_pkru_read_file, +	.write = init_pkru_write_file, +	.llseek = default_llseek, +}; + +static int __init create_init_pkru_value(void) +{ +	debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, +			arch_debugfs_dir, NULL, &fops_init_pkru); +	return 0; +} +late_initcall(create_init_pkru_value); + +static __init int setup_init_pkru(char *opt) +{ +	u32 new_init_pkru; + +	if (kstrtouint(opt, 0, &new_init_pkru)) +		return 1; + +	WRITE_ONCE(init_pkru_value, new_init_pkru); + +	return 1; +} +__setup("init_pkru=", setup_init_pkru); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4dbe65622810..a7655f6caf7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  	unsigned cpu = smp_processor_id();  	if (likely(prev != next)) { +		if (IS_ENABLED(CONFIG_VMAP_STACK)) { +			/* +			 * If our current stack is in vmalloc space and isn't +			 * mapped in the new pgd, we'll double-fault.  Forcibly +			 * map it. +			 */ +			unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + +			pgd_t *pgd = next->pgd + stack_pgd_index; + +			if (unlikely(pgd_none(*pgd))) +				set_pgd(pgd, init_mm.pgd[stack_pgd_index]); +		} +  #ifdef CONFIG_SMP  		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);  		this_cpu_write(cpu_tlbstate.active_mm, next);  #endif +  		cpumask_set_cpu(cpu, mm_cpumask(next));  		/* |