diff options
-rw-r--r-- | arch/x86/include/asm/fpu/signal.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/mmu_context.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/tlbflush.h | 9 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/signal.c | 27 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/xstate.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/xstate.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/machine_kexec_64.c | 27 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 42 | ||||
-rw-r--r-- | arch/x86/kernel/signal.c | 29 | ||||
-rw-r--r-- | arch/x86/kernel/signal_64.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/ident_map.c | 23 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 19 | ||||
-rw-r--r-- | include/linux/ioremap.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/mm/Makefile | 1 | ||||
-rw-r--r-- | tools/testing/selftests/mm/pkey-helpers.h | 13 | ||||
-rw-r--r-- | tools/testing/selftests/mm/pkey_sighandler_tests.c | 481 | ||||
-rw-r--r-- | tools/testing/selftests/mm/protection_keys.c | 10 |
20 files changed, 664 insertions, 60 deletions
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 611fa41711af..eccc75bc9c4f 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru); extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8dac45a2c7fc..19091ebb8633 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { - return mm->context.lam_cr3_mask; + /* + * When switch_mm_irqs_off() is called for a kthread, it may race with + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it + * reads a single value for both. + */ + return READ_ONCE(mm->context.lam_cr3_mask); } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 2f321137736c..6f82e75b6149 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -517,8 +517,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; -extern void set_nx(void); -extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 25726893c6f4..69e79fff41b8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void) return lam << X86_CR3_LAM_U57_BIT; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { - this_cpu_write(cpu_tlbstate.lam, - mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); - this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); + this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, untag_mask); } #else @@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void) return 0; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 247f2225aa9f..1065ab995305 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -64,6 +64,16 @@ setfx: } /* + * Update the value of PKRU register that was already pushed onto the signal frame. + */ +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) +{ + if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) + return 0; + return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); +} + +/* * Signal frame handlers. */ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) @@ -156,10 +166,17 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, return !err; } -static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { - if (use_xsave()) - return xsave_to_user_sigframe(buf); + int err = 0; + + if (use_xsave()) { + err = xsave_to_user_sigframe(buf); + if (!err) + err = update_pkru_in_sigframe(buf, pkru); + return err; + } + if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else @@ -185,7 +202,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; @@ -228,7 +245,7 @@ retry: fpregs_restore_userregs(); pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); + ret = copy_fpregs_to_sigframe(buf_fx, pkru); pagefault_enable(); fpregs_unlock(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 53d9f01d5988..22abb5ee0cf2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -999,6 +999,19 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) } EXPORT_SYMBOL_GPL(get_xsave_addr); +/* + * Given an xstate feature nr, calculate where in the xsave buffer the state is. + * The xsave buffer should be in standard format, not compacted (e.g. user mode + * signal frames). + */ +void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) +{ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + return (void __user *)xsave + xstate_offsets[xfeature_nr]; +} + #ifdef CONFIG_ARCH_HAS_PKEYS /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index afb404cd2059..0b86a5002c84 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -54,6 +54,8 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); +extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); + static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cc0f7f70b17b..9c9ac606893e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/cpu.h> +#include <asm/efi.h> #ifdef CONFIG_ACPI /* @@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; + void *kaddr; + int ret; if (!efi_enabled(EFI_BOOT)) return 0; @@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) if (!mstart) return 0; + ret = kernel_ident_mapping_init(info, level4p, mstart, mend); + if (ret) + return ret; + + kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); + if (!kaddr) { + pr_err("Could not map UEFI system table\n"); + return -ENOMEM; + } + + mstart = efi_config_table; + + if (efi_enabled(EFI_64BIT)) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; + } else { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; + } + + memunmap(kaddr); + return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6d3d20e3e43a..226472332a70 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -798,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #define LAM_U57_BITS 6 +static void enable_lam_func(void *__mm) +{ + struct mm_struct *mm = __mm; + unsigned long lam; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { + lam = mm_lam_cr3_mask(mm); + write_cr3(__read_cr3() | lam); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); + } +} + +static void mm_enable_lam(struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + + /* + * Even though the process must still be single-threaded at this + * point, kernel threads may be using the mm. IPI those kernel + * threads if they exist. + */ + on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); +} + static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) @@ -814,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) if (mmap_write_lock_killable(mm)) return -EINTR; + /* + * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from + * being enabled unless the process is single threaded: + */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } - if (!nr_bits) { - mmap_write_unlock(mm); - return -EINVAL; - } else if (nr_bits <= LAM_U57_BITS) { - mm->context.lam_cr3_mask = X86_CR3_LAM_U57; - mm->context.untag_mask = ~GENMASK(62, 57); - } else { + if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); - set_tlbstate_lam_mode(mm); - set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + mm_enable_lam(mm); mmap_write_unlock(mm); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 31b6f5dddfc2..5f441039b572 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig) } /* + * Enable all pkeys temporarily, so as to ensure that both the current + * execution stack as well as the alternate signal stack are writeable. + * The application can use any of the available pkeys to protect the + * alternate signal stack, and we don't know which one it is, so enable + * all. The PKRU register will be reset to init_pkru later in the flow, + * in fpu__clear_user_states(), and it is the application's responsibility + * to enable the appropriate pkey as the first step in the signal handler + * so that the handler does not segfault. + */ +static inline u32 sig_prepare_pkru(void) +{ + u32 orig_pkru = read_pkru(); + + write_pkru(0); + return orig_pkru; +} + +/* * Set up a signal frame. */ @@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; + u32 pkru; /* redzone */ if (!ia32_frame) @@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; } + /* Update PKRU to enable access to the alternate signal stack. */ + pkru = sig_prepare_pkru(); /* save i387 and extended state */ - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { + /* + * Restore PKRU to the original, user-defined value; disable + * extra pkeys enabled for the alternate signal stack, if any. + */ + write_pkru(pkru); return (void __user *)-1L; + } return (void __user *)sp; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 8a94053c5444..ee9453891901 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -260,13 +260,13 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - if (restore_signal_shadow_stack()) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; - if (restore_altstack(&frame->uc.uc_stack)) + if (restore_signal_shadow_stack()) goto badframe; return regs->ax; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index c45127265f2f..437e96fb4977 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -99,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - if (info->direct_gbpages) { - pud_t pudval; + /* if this is already a gbpage, this portion is already mapped */ + if (pud_leaf(*pud)) + continue; + + /* Is using a gbpage allowed? */ + use_gbpage = info->direct_gbpages; - if (pud_present(*pud)) - continue; + /* Don't use gbpage if it maps more than the requested region. */ + /* at the begining: */ + use_gbpage &= ((addr & ~PUD_MASK) == 0); + /* ... or at the end: */ + use_gbpage &= ((next & ~PUD_MASK) == 0); + + /* Never overwrite existing mappings */ + use_gbpage &= !pud_present(*pud); + + if (use_gbpage) { + pud_t pudval; - addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index aa7d279321ea..70b02fc61d93 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/ioport.h> +#include <linux/ioremap.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> @@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr) { struct vm_struct *p, *o; - if ((void __force *)addr <= high_memory) + if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr))) return; /* diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9c52a95937ad..6f8e0f21c710 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ @@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } int __init x86_acpi_numa_init(void) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 44ac64f3a047..86593d1b787d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -11,6 +11,7 @@ #include <linux/sched/smt.h> #include <linux/task_work.h> #include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -85,9 +86,6 @@ * */ -/* There are 12 bits of space for ASIDS in CR3 */ -#define CR3_HW_ASID_BITS 12 - /* * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches @@ -160,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) unsigned long cr3 = __sme_pa(pgd) | lam; if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); cr3 |= kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); @@ -503,9 +500,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, { struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); + unsigned long new_lam; u64 next_tlb_gen; bool need_flush; u16 new_asid; @@ -619,9 +616,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cpumask_clear_cpu(cpu, mm_cpumask(prev)); } - /* - * Start remote flushes and then read tlb_gen. - */ + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); @@ -633,7 +628,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, barrier(); } - set_tlbstate_lam_mode(next); + new_lam = mm_lam_cr3_mask(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); @@ -652,6 +647,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { cr4_update_pce_mm(next); @@ -698,6 +694,7 @@ void initialize_tlbstate_and_flush(void) int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long lam = mm_lam_cr3_mask(mm); unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. */ @@ -705,7 +702,7 @@ void initialize_tlbstate_and_flush(void) /* LAM expected to be disabled */ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); - WARN_ON(mm_lam_cr3_mask(mm)); + WARN_ON(lam); /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization @@ -724,7 +721,7 @@ void initialize_tlbstate_and_flush(void) this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); - set_tlbstate_lam_mode(mm); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h index f0e99fc7dd8b..2bd1661fe9ad 100644 --- a/include/linux/ioremap.h +++ b/include/linux/ioremap.h @@ -4,6 +4,7 @@ #include <linux/kasan.h> #include <asm/pgtable.h> +#include <asm/vmalloc.h> #if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) /* diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 5f2ca591c956..e10b87376fde 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -90,6 +90,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) VMTARGETS := protection_keys +VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 15608350fc01..9ab6a3ee153b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -79,7 +79,18 @@ extern void abort_hooks(void); } \ } while (0) -__attribute__((noinline)) int read_ptr(int *ptr); +#define barrier() __asm__ __volatile__("": : :"memory") +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c new file mode 100644 index 000000000000..a8088b645ad6 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * The testcases in this file exercise various flows related to signal handling, + * using an alternate signal stack, with the default pkey (pkey 0) disabled. + * + * Compile with: + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <errno.h> +#include <sys/syscall.h> +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <pthread.h> +#include <limits.h> + +#include "pkey-helpers.h" + +#define STACK_SIZE PTHREAD_STACK_MIN + +void expected_pkey_fault(int pkey) {} + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +siginfo_t siginfo = {0}; + +/* + * We need to use inline assembly instead of glibc's syscall because glibc's + * syscall will attempt to access the PLT in order to call a library function + * which is protected by MPK 0 which we don't have access to. + */ +static inline __always_inline +long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) +{ + unsigned long ret; +#ifdef __x86_64__ + register long r10 asm("r10") = a4; + register long r8 asm("r8") = a5; + register long r9 asm("r9") = a6; + asm volatile ("syscall" + : "=a"(ret) + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) + : "rcx", "r11", "memory"); +#elif defined __i386__ + asm volatile ("int $0x80" + : "=a"(ret) + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) + : "memory"); +#else +# error syscall_raw() not implemented +#endif + return ret; +} + +static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); +} + +static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); +} + +static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) +{ + /* + * pkru should be the init_pkru value which enabled MPK 0 so + * we can use library functions. + */ + printf("%s invoked.\n", __func__); +} + +static void raise_sigusr2(void) +{ + pid_t tid = 0; + + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); + + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); + + /* + * We should return from the signal handler here and be able to + * return to the interrupted thread. + */ +} + +static void *thread_segv_with_pkey0_disabled(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* Segfault (with SEGV_MAPERR) */ + *(int *) (0x1) = 1; + return NULL; +} + +static void *thread_segv_pkuerr_stack(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* After we disable MPK 0, we can't access the stack to return */ + return NULL; +} + +static void *thread_segv_maperr_ptr(void *ptr) +{ + stack_t *stack = ptr; + int *bad = (int *)1; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 1 is enabled. */ + __write_pkey_reg(0x55555551); + + /* Segfault */ + *bad = 1; + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0. + */ +static void test_sigsegv_handler_with_pkey0_disabled(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0, which renders them inaccessible when MPK 0 + * is disabled. So just the return from the thread should cause a + * segfault with SEGV_PKUERR. + */ +static void test_sigsegv_handler_cannot_access_stack(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_PKUERR, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler that uses an alternate signal stack + * is correctly invoked for a thread which uses a non-zero MPK to protect + * its own stack, and disables all other MPKs (including 0). + */ +static void test_sigsegv_handler_with_different_pkey_for_stack(void) +{ + struct sigaction sa; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + + sa.sa_sigaction = sigsegv_handler; + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* Allow access to MPK 0 and MPK 1 */ + __write_pkey_reg(0x55555550); + + /* Protect the new stack with MPK 1 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + memset(&siginfo, 0, sizeof(siginfo)); + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_segv_maperr_ptr(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the PKRU value set by the application is correctly + * restored upon return from signal handling. + */ +static void test_pkru_preserved_after_sigusr1(void) +{ + struct sigaction sa; + unsigned long pkru = 0x45454544; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigusr1_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + __write_pkey_reg(pkru); + + raise(SIGUSR1); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + /* Ensure the pkru value is the same after returning from signal. */ + ksft_test_result(pkru == __read_pkey_reg() && + siginfo.si_signo == SIGUSR1, + "%s\n", __func__); +} + +static noinline void *thread_sigusr2_self(void *ptr) +{ + /* + * A const char array like "Resuming after SIGUSR2" won't be stored on + * the stack and the code could access it via an offset from the program + * counter. This makes sure it's on the function's stack frame. + */ + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', + 'a', 'f', 't', 'e', 'r', ' ', + 'S', 'I', 'G', 'U', 'S', 'R', '2', + '.', '.', '.', '\n', '\0'}; + stack_t *stack = ptr; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 2 is enabled. */ + __write_pkey_reg(0x55555545); + + raise_sigusr2(); + + /* Do something, to show the thread resumed execution after the signal */ + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); + + /* + * We can't return to test_pkru_sigreturn because it + * will attempt to use a %rbp value which is on the stack + * of the main thread. + */ + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that sigreturn is able to restore altstack even if the thread had + * disabled pkey 0. + */ +static void test_pkru_sigreturn(void) +{ + struct sigaction sa = {0}; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + /* + * For this testcase, we do not want to handle SIGSEGV. Reset handler + * to default so that the application can crash if it receives SIGSEGV. + */ + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = sigusr2_handler; + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* + * Allow access to MPK 0 and MPK 2. The child thread (to be created + * later in this flow) will have its stack protected by MPK 2, whereas + * the current thread's stack is protected by the default MPK 0. Hence + * both need to be enabled. + */ + __write_pkey_reg(0x55555544); + + /* Protect the stack with MPK 2 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_sigusr2_self(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + child_pid = ret; + /* Check that thread exited */ + do { + sched_yield(); + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); + } while (ret != -ESRCH && ret != -EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +static void (*pkey_tests[])(void) = { + test_sigsegv_handler_with_pkey0_disabled, + test_sigsegv_handler_cannot_access_stack, + test_sigsegv_handler_with_different_pkey_for_stack, + test_pkru_preserved_after_sigusr1, + test_pkru_sigreturn +}; + +int main(int argc, char *argv[]) +{ + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(pkey_tests)); + + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) + (*pkey_tests[i])(); + + ksft_finished(); + return 0; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 0789981b72b9..4990f7ab4cb7 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -954,16 +954,6 @@ void close_test_fds(void) nr_test_fds = 0; } -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; |