diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/amdtopology.c | 22 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 43 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/kaslr.c | 26 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 27 | ||||
-rw-r--r-- | arch/x86/mm/pat_rbtree.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/pkeys.c | 142 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 15 |
9 files changed, 238 insertions, 47 deletions
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index ba47524f56e8..d1c7de095808 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -52,21 +52,6 @@ static __init int find_northbridge(void) return -ENOENT; } -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - int __init amd_numa_init(void) { u64 start = PFN_PHYS(0); @@ -180,8 +165,11 @@ int __init amd_numa_init(void) cores = 1 << bits; apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 832b98f822be..79ae939970d3 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/extable.h> #include <asm/uaccess.h> #include <asm/traps.h> #include <asm/kdebug.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..4dc13340653e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -5,7 +5,7 @@ */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ -#include <linux/module.h> /* search_exception_table */ +#include <linux/extable.h> /* search_exception_table */ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ @@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; } +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + register void *__sp asm("rsp"); + unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : "+r" (__sp) + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + /* * 32-bit: * @@ -1112,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; + + /* + * Read or write was blocked by protection keys. This is + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ + if (error_code & PF_PK) + return 1; + /* * Make sure to check the VMA so that we do not perform * faults just to hit a PF_PK as soon as we fill in a diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d28a2d741f9e..22af912d66d2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } -void free_initmem(void) +void __ref free_initmem(void) { + e820_reallocate_tables(); + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index bda8d5eef04d..ddd2661c4502 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -40,17 +40,26 @@ * You need to add an if/def entry if you introduce a new memory region * compatible with KASLR. Your entry must be in logical order with memory * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to * ensure that this order is correct and won't be changed. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMEMMAP_START; + +#if defined(CONFIG_X86_ESPFIX64) +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +#elif defined(CONFIG_EFI) +static const unsigned long vaddr_end = EFI_VA_START; +#else +static const unsigned long vaddr_end = __START_KERNEL_map; +#endif /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base = __VMALLOC_BASE; EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 64/* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, + { &vmemmap_base, 1 }, }; /* Get size in bytes used by the memory region */ @@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void) struct rnd_state rand_state; unsigned long remain_entropy; + /* + * All these BUILD_BUG_ON checks ensures the memory layout is + * consistent with the vaddr_start/vaddr_end variables. + */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + vaddr_end >= EFI_VA_START); + BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || + config_enabled(CONFIG_EFI)) && + vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + if (!kaslr_memory_enabled()) return; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fb682108f4dc..3f35b48d1d9d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,22 +722,19 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static __init int find_near_online_node(int node) +static void __init init_memory_less_node(int nid) { - int n, val; - int min_val = INT_MAX; - int best_node = -1; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; - for_each_online_node(n) { - val = node_distance(node, n); + /* Allocate and initialize node data. Memory-less node is now online.*/ + alloc_node_data(nid); + free_area_init_node(nid, zones_size, 0, zholes_size); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; + /* + * All zonelists will be built later in start_kernel() after per cpu + * areas are initialized. + */ } /* @@ -766,8 +763,10 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + if (!node_online(node)) - node = find_near_online_node(node); + init_memory_less_node(node); + numa_set_node(cpu, node); } } diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index de391b7bc19a..159b52ccd600 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) struct memtype *rbt_memtype_lookup(u64 addr) { - struct memtype *data; - data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); - return data; + return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); } #if defined(CONFIG_DEBUG_FS) diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e8c474451928..f88ce0e5efd9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -11,6 +11,7 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ +#include <linux/debugfs.h> /* debugfs_create_u32() */ #include <linux/mm_types.h> /* mm_struct, vma, etc... */ #include <linux/pkeys.h> /* PKEY_* */ #include <uapi/asm-generic/mman-common.h> @@ -21,8 +22,19 @@ int __execute_only_pkey(struct mm_struct *mm) { + bool need_to_set_mm_pkey = false; + int execute_only_pkey = mm->context.execute_only_pkey; int ret; + /* Do we need to assign a pkey for mm's execute-only maps? */ + if (execute_only_pkey == -1) { + /* Go allocate one to use, which might fail */ + execute_only_pkey = mm_pkey_alloc(mm); + if (execute_only_pkey < 0) + return -1; + need_to_set_mm_pkey = true; + } + /* * We do not want to go through the relatively costly * dance to set PKRU if we do not need to. Check it @@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm) * can make fpregs inactive. */ preempt_disable(); - if (fpregs_active() && - !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { + if (!need_to_set_mm_pkey && + fpregs_active() && + !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); - return PKEY_DEDICATED_EXECUTE_ONLY; + return execute_only_pkey; } preempt_enable(); - ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + + /* + * Set up PKRU so that it denies access for everything + * other than execution. + */ + ret = arch_set_user_pkey_access(current, execute_only_pkey, PKEY_DISABLE_ACCESS); /* * If the PKRU-set operation failed somehow, just return * 0 and effectively disable execute-only support. */ - if (ret) - return 0; + if (ret) { + mm_set_pkey_free(mm, execute_only_pkey); + return -1; + } - return PKEY_DEDICATED_EXECUTE_ONLY; + /* We got one, store it and use it from here on out */ + if (need_to_set_mm_pkey) + mm->context.execute_only_pkey = execute_only_pkey; + return execute_only_pkey; } static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) @@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) /* Do this check first since the vm_flags should be hot */ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) return false; - if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) + if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey) return false; return true; @@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey */ return vma_pkey(vma); } + +#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) + +/* + * Make the default PKRU value (at execve() time) as restrictive + * as possible. This ensures that any threads clone()'d early + * in the process's lifetime will not accidentally get access + * to data which is pkey-protected later on. + */ +u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | + PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | + PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | + PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | + PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); + +/* + * Called from the FPU code when creating a fresh set of FPU + * registers. This is called from a very specific context where + * we know the FPU regstiers are safe for use and we can use PKRU + * directly. The fact that PKRU is only available when we are + * using eagerfpu mode makes this possible. + */ +void copy_init_pkru_to_fpregs(void) +{ + u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); + /* + * Any write to PKRU takes it out of the XSAVE 'init + * state' which increases context switch cost. Avoid + * writing 0 when PKRU was already 0. + */ + if (!init_pkru_value_snapshot && !read_pkru()) + return; + /* + * Override the PKRU state that came from 'init_fpstate' + * with the baseline from the process. + */ + write_pkru(init_pkru_value_snapshot); +} + +static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "0x%x\n", init_pkru_value); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t init_pkru_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + u32 new_init_pkru; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + /* Make the buffer a valid string that we can not overrun */ + buf[len] = '\0'; + if (kstrtouint(buf, 0, &new_init_pkru)) + return -EINVAL; + + /* + * Don't allow insane settings that will blow the system + * up immediately if someone attempts to disable access + * or writes to pkey 0. + */ + if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT)) + return -EINVAL; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + return count; +} + +static const struct file_operations fops_init_pkru = { + .read = init_pkru_read_file, + .write = init_pkru_write_file, + .llseek = default_llseek, +}; + +static int __init create_init_pkru_value(void) +{ + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_init_pkru); + return 0; +} +late_initcall(create_init_pkru_value); + +static __init int setup_init_pkru(char *opt) +{ + u32 new_init_pkru; + + if (kstrtouint(opt, 0, &new_init_pkru)) + return 1; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + + return 1; +} +__setup("init_pkru=", setup_init_pkru); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4dbe65622810..a7655f6caf7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + + pgd_t *pgd = next->pgd + stack_pgd_index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } + #ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); #endif + cpumask_set_cpu(cpu, mm_cpumask(next)); /* |