aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/fpu/signal.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h9
-rw-r--r--arch/x86/kernel/fpu/signal.c27
-rw-r--r--arch/x86/kernel/fpu/xstate.c13
-rw-r--r--arch/x86/kernel/fpu/xstate.h2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c27
-rw-r--r--arch/x86/kernel/process_64.c42
-rw-r--r--arch/x86/kernel/signal.c29
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/mm/ident_map.c23
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/srat.c6
-rw-r--r--arch/x86/mm/tlb.c19
-rw-r--r--include/linux/ioremap.h1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/pkey-helpers.h13
-rw-r--r--tools/testing/selftests/mm/pkey_sighandler_tests.c481
-rw-r--r--tools/testing/selftests/mm/protection_keys.c10
20 files changed, 664 insertions, 60 deletions
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 611fa41711af..eccc75bc9c4f 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long fpu__get_fpstate_size(void);
-extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru);
extern void fpu__clear_user_states(struct fpu *fpu);
extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8dac45a2c7fc..19091ebb8633 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
#ifdef CONFIG_ADDRESS_MASKING
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
- return mm->context.lam_cr3_mask;
+ /*
+ * When switch_mm_irqs_off() is called for a kthread, it may race with
+ * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
+ * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
+ * reads a single value for both.
+ */
+ return READ_ONCE(mm->context.lam_cr3_mask);
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2f321137736c..6f82e75b6149 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -517,8 +517,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
-extern void set_nx(void);
-extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4..69e79fff41b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return lam << X86_CR3_LAM_U57_BIT;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
- this_cpu_write(cpu_tlbstate.lam,
- mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
- this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+ this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, untag_mask);
}
#else
@@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return 0;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 247f2225aa9f..1065ab995305 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -64,6 +64,16 @@ setfx:
}
/*
+ * Update the value of PKRU register that was already pushed onto the signal frame.
+ */
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+ return 0;
+ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
+}
+
+/*
* Signal frame handlers.
*/
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
@@ -156,10 +166,17 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
return !err;
}
-static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
{
- if (use_xsave())
- return xsave_to_user_sigframe(buf);
+ int err = 0;
+
+ if (use_xsave()) {
+ err = xsave_to_user_sigframe(buf);
+ if (!err)
+ err = update_pkru_in_sigframe(buf, pkru);
+ return err;
+ }
+
if (use_fxsr())
return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
@@ -185,7 +202,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
{
struct task_struct *tsk = current;
struct fpstate *fpstate = tsk->thread.fpu.fpstate;
@@ -228,7 +245,7 @@ retry:
fpregs_restore_userregs();
pagefault_disable();
- ret = copy_fpregs_to_sigframe(buf_fx);
+ ret = copy_fpregs_to_sigframe(buf_fx, pkru);
pagefault_enable();
fpregs_unlock();
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 53d9f01d5988..22abb5ee0cf2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -999,6 +999,19 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
+/*
+ * Given an xstate feature nr, calculate where in the xsave buffer the state is.
+ * The xsave buffer should be in standard format, not compacted (e.g. user mode
+ * signal frames).
+ */
+void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void __user *)xsave + xstate_offsets[xfeature_nr];
+}
+
#ifdef CONFIG_ARCH_HAS_PKEYS
/*
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index afb404cd2059..0b86a5002c84 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -54,6 +54,8 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
+extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
+
static inline u64 xfeatures_mask_supervisor(void)
{
return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cc0f7f70b17b..9c9ac606893e 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>
+#include <asm/efi.h>
#ifdef CONFIG_ACPI
/*
@@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
unsigned long mstart, mend;
+ void *kaddr;
+ int ret;
if (!efi_enabled(EFI_BOOT))
return 0;
@@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
if (!mstart)
return 0;
+ ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+ if (ret)
+ return ret;
+
+ kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+ if (!kaddr) {
+ pr_err("Could not map UEFI system table\n");
+ return -ENOMEM;
+ }
+
+ mstart = efi_config_table;
+
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
+ }
+
+ memunmap(kaddr);
+
return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6d3d20e3e43a..226472332a70 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -798,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
#define LAM_U57_BITS 6
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+ unsigned long lam;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ lam = mm_lam_cr3_mask(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
+ }
+}
+
+static void mm_enable_lam(struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+
+ /*
+ * Even though the process must still be single-threaded at this
+ * point, kernel threads may be using the mm. IPI those kernel
+ * threads if they exist.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+}
+
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@@ -814,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
if (mmap_write_lock_killable(mm))
return -EINTR;
+ /*
+ * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
+ * being enabled unless the process is single threaded:
+ */
if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
mmap_write_unlock(mm);
return -EBUSY;
}
- if (!nr_bits) {
- mmap_write_unlock(mm);
- return -EINVAL;
- } else if (nr_bits <= LAM_U57_BITS) {
- mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
- mm->context.untag_mask = ~GENMASK(62, 57);
- } else {
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
mmap_write_unlock(mm);
return -EINVAL;
}
- write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
- set_tlbstate_lam_mode(mm);
- set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+ mm_enable_lam(mm);
mmap_write_unlock(mm);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 31b6f5dddfc2..5f441039b572 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig)
}
/*
+ * Enable all pkeys temporarily, so as to ensure that both the current
+ * execution stack as well as the alternate signal stack are writeable.
+ * The application can use any of the available pkeys to protect the
+ * alternate signal stack, and we don't know which one it is, so enable
+ * all. The PKRU register will be reset to init_pkru later in the flow,
+ * in fpu__clear_user_states(), and it is the application's responsibility
+ * to enable the appropriate pkey as the first step in the signal handler
+ * so that the handler does not segfault.
+ */
+static inline u32 sig_prepare_pkru(void)
+{
+ u32 orig_pkru = read_pkru();
+
+ write_pkru(0);
+ return orig_pkru;
+}
+
+/*
* Set up a signal frame.
*/
@@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
+ u32 pkru;
/* redzone */
if (!ia32_frame)
@@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
return (void __user *)-1L;
}
+ /* Update PKRU to enable access to the alternate signal stack. */
+ pkru = sig_prepare_pkru();
/* save i387 and extended state */
- if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
+ /*
+ * Restore PKRU to the original, user-defined value; disable
+ * extra pkeys enabled for the alternate signal stack, if any.
+ */
+ write_pkru(pkru);
return (void __user *)-1L;
+ }
return (void __user *)sp;
}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 8a94053c5444..ee9453891901 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -260,13 +260,13 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- if (restore_signal_shadow_stack())
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
- if (restore_altstack(&frame->uc.uc_stack))
+ if (restore_signal_shadow_stack())
goto badframe;
return regs->ax;
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index c45127265f2f..437e96fb4977 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -99,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ bool use_gbpage;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
- if (info->direct_gbpages) {
- pud_t pudval;
+ /* if this is already a gbpage, this portion is already mapped */
+ if (pud_leaf(*pud))
+ continue;
+
+ /* Is using a gbpage allowed? */
+ use_gbpage = info->direct_gbpages;
- if (pud_present(*pud))
- continue;
+ /* Don't use gbpage if it maps more than the requested region. */
+ /* at the begining: */
+ use_gbpage &= ((addr & ~PUD_MASK) == 0);
+ /* ... or at the end: */
+ use_gbpage &= ((next & ~PUD_MASK) == 0);
+
+ /* Never overwrite existing mappings */
+ use_gbpage &= !pud_present(*pud);
+
+ if (use_gbpage) {
+ pud_t pudval;
- addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index aa7d279321ea..70b02fc61d93 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
+#include <linux/ioremap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
@@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
- if ((void __force *)addr <= high_memory)
+ if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr)))
return;
/*
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 9c52a95937ad..6f8e0f21c710 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
int __init x86_acpi_numa_init(void)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 44ac64f3a047..86593d1b787d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -85,9 +86,6 @@
*
*/
-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS 12
-
/*
* When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
@@ -160,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
unsigned long cr3 = __sme_pa(pgd) | lam;
if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
@@ -503,9 +500,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
{
struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
+ unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
@@ -619,9 +616,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
- /*
- * Start remote flushes and then read tlb_gen.
- */
+ /* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm)
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
@@ -633,7 +628,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
barrier();
}
- set_tlbstate_lam_mode(next);
+ new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -652,6 +647,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
if (next != prev) {
cr4_update_pce_mm(next);
@@ -698,6 +694,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@@ -705,7 +702,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
- WARN_ON(mm_lam_cr3_mask(mm));
+ WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -724,7 +721,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- set_tlbstate_lam_mode(mm);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h
index f0e99fc7dd8b..2bd1661fe9ad 100644
--- a/include/linux/ioremap.h
+++ b/include/linux/ioremap.h
@@ -4,6 +4,7 @@
#include <linux/kasan.h>
#include <asm/pgtable.h>
+#include <asm/vmalloc.h>
#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP)
/*
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 5f2ca591c956..e10b87376fde 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -90,6 +90,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr
CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
VMTARGETS := protection_keys
+VMTARGETS += pkey_sighandler_tests
BINARIES_32 := $(VMTARGETS:%=%_32)
BINARIES_64 := $(VMTARGETS:%=%_64)
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
index 15608350fc01..9ab6a3ee153b 100644
--- a/tools/testing/selftests/mm/pkey-helpers.h
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -79,7 +79,18 @@ extern void abort_hooks(void);
} \
} while (0)
-__attribute__((noinline)) int read_ptr(int *ptr);
+#define barrier() __asm__ __volatile__("": : :"memory")
+#ifndef noinline
+# define noinline __attribute__((noinline))
+#endif
+
+noinline int read_ptr(int *ptr)
+{
+ /* Keep GCC from optimizing this away somehow */
+ barrier();
+ return *ptr;
+}
+
void expected_pkey_fault(int pkey);
int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
int sys_pkey_free(unsigned long pkey);
diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c
new file mode 100644
index 000000000000..a8088b645ad6
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * The testcases in this file exercise various flows related to signal handling,
+ * using an alternate signal stack, with the default pkey (pkey 0) disabled.
+ *
+ * Compile with:
+ * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
+ * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <limits.h>
+
+#include "pkey-helpers.h"
+
+#define STACK_SIZE PTHREAD_STACK_MIN
+
+void expected_pkey_fault(int pkey) {}
+
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+siginfo_t siginfo = {0};
+
+/*
+ * We need to use inline assembly instead of glibc's syscall because glibc's
+ * syscall will attempt to access the PLT in order to call a library function
+ * which is protected by MPK 0 which we don't have access to.
+ */
+static inline __always_inline
+long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
+{
+ unsigned long ret;
+#ifdef __x86_64__
+ register long r10 asm("r10") = a4;
+ register long r8 asm("r8") = a5;
+ register long r9 asm("r9") = a6;
+ asm volatile ("syscall"
+ : "=a"(ret)
+ : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9)
+ : "rcx", "r11", "memory");
+#elif defined __i386__
+ asm volatile ("int $0x80"
+ : "=a"(ret)
+ : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
+ : "memory");
+#else
+# error syscall_raw() not implemented
+#endif
+ return ret;
+}
+
+static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ pthread_mutex_lock(&mutex);
+
+ memcpy(&siginfo, info, sizeof(siginfo_t));
+
+ pthread_cond_signal(&cond);
+ pthread_mutex_unlock(&mutex);
+
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+}
+
+static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ pthread_mutex_lock(&mutex);
+
+ memcpy(&siginfo, info, sizeof(siginfo_t));
+
+ pthread_cond_signal(&cond);
+ pthread_mutex_unlock(&mutex);
+}
+
+static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ /*
+ * pkru should be the init_pkru value which enabled MPK 0 so
+ * we can use library functions.
+ */
+ printf("%s invoked.\n", __func__);
+}
+
+static void raise_sigusr2(void)
+{
+ pid_t tid = 0;
+
+ tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0);
+
+ syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0);
+
+ /*
+ * We should return from the signal handler here and be able to
+ * return to the interrupted thread.
+ */
+}
+
+static void *thread_segv_with_pkey0_disabled(void *ptr)
+{
+ /* Disable MPK 0 (and all others too) */
+ __write_pkey_reg(0x55555555);
+
+ /* Segfault (with SEGV_MAPERR) */
+ *(int *) (0x1) = 1;
+ return NULL;
+}
+
+static void *thread_segv_pkuerr_stack(void *ptr)
+{
+ /* Disable MPK 0 (and all others too) */
+ __write_pkey_reg(0x55555555);
+
+ /* After we disable MPK 0, we can't access the stack to return */
+ return NULL;
+}
+
+static void *thread_segv_maperr_ptr(void *ptr)
+{
+ stack_t *stack = ptr;
+ int *bad = (int *)1;
+
+ /*
+ * Setup alternate signal stack, which should be pkey_mprotect()ed by
+ * MPK 0. The thread's stack cannot be used for signals because it is
+ * not accessible by the default init_pkru value of 0x55555554.
+ */
+ syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
+
+ /* Disable MPK 0. Only MPK 1 is enabled. */
+ __write_pkey_reg(0x55555551);
+
+ /* Segfault */
+ *bad = 1;
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ return NULL;
+}
+
+/*
+ * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
+ * Note that the new thread stack and the alternate signal stack is
+ * protected by MPK 0.
+ */
+static void test_sigsegv_handler_with_pkey0_disabled(void)
+{
+ struct sigaction sa;
+ pthread_attr_t attr;
+ pthread_t thr;
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigsegv_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_MAPERR &&
+ siginfo.si_addr == (void *)1,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
+ * Note that the new thread stack and the alternate signal stack is
+ * protected by MPK 0, which renders them inaccessible when MPK 0
+ * is disabled. So just the return from the thread should cause a
+ * segfault with SEGV_PKUERR.
+ */
+static void test_sigsegv_handler_cannot_access_stack(void)
+{
+ struct sigaction sa;
+ pthread_attr_t attr;
+ pthread_t thr;
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigsegv_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_PKUERR,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the sigsegv handler that uses an alternate signal stack
+ * is correctly invoked for a thread which uses a non-zero MPK to protect
+ * its own stack, and disables all other MPKs (including 0).
+ */
+static void test_sigsegv_handler_with_different_pkey_for_stack(void)
+{
+ struct sigaction sa;
+ static stack_t sigstack;
+ void *stack;
+ int pkey;
+ int parent_pid = 0;
+ int child_pid = 0;
+
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+
+ sa.sa_sigaction = sigsegv_handler;
+
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ assert(stack != MAP_FAILED);
+
+ /* Allow access to MPK 0 and MPK 1 */
+ __write_pkey_reg(0x55555550);
+
+ /* Protect the new stack with MPK 1 */
+ pkey = pkey_alloc(0, 0);
+ pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);
+
+ /* Set up alternate signal stack that will use the default MPK */
+ sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ sigstack.ss_flags = 0;
+ sigstack.ss_size = STACK_SIZE;
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ /* Use clone to avoid newer glibcs using rseq on new threads */
+ long ret = syscall_raw(SYS_clone,
+ CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ (long) ((char *)(stack) + STACK_SIZE),
+ (long) &parent_pid,
+ (long) &child_pid, 0, 0);
+
+ if (ret < 0) {
+ errno = -ret;
+ perror("clone");
+ } else if (ret == 0) {
+ thread_segv_maperr_ptr(&sigstack);
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ }
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_MAPERR &&
+ siginfo.si_addr == (void *)1,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the PKRU value set by the application is correctly
+ * restored upon return from signal handling.
+ */
+static void test_pkru_preserved_after_sigusr1(void)
+{
+ struct sigaction sa;
+ unsigned long pkru = 0x45454544;
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigusr1_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGUSR1, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ __write_pkey_reg(pkru);
+
+ raise(SIGUSR1);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ /* Ensure the pkru value is the same after returning from signal. */
+ ksft_test_result(pkru == __read_pkey_reg() &&
+ siginfo.si_signo == SIGUSR1,
+ "%s\n", __func__);
+}
+
+static noinline void *thread_sigusr2_self(void *ptr)
+{
+ /*
+ * A const char array like "Resuming after SIGUSR2" won't be stored on
+ * the stack and the code could access it via an offset from the program
+ * counter. This makes sure it's on the function's stack frame.
+ */
+ char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ',
+ 'a', 'f', 't', 'e', 'r', ' ',
+ 'S', 'I', 'G', 'U', 'S', 'R', '2',
+ '.', '.', '.', '\n', '\0'};
+ stack_t *stack = ptr;
+
+ /*
+ * Setup alternate signal stack, which should be pkey_mprotect()ed by
+ * MPK 0. The thread's stack cannot be used for signals because it is
+ * not accessible by the default init_pkru value of 0x55555554.
+ */
+ syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
+
+ /* Disable MPK 0. Only MPK 2 is enabled. */
+ __write_pkey_reg(0x55555545);
+
+ raise_sigusr2();
+
+ /* Do something, to show the thread resumed execution after the signal */
+ syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0);
+
+ /*
+ * We can't return to test_pkru_sigreturn because it
+ * will attempt to use a %rbp value which is on the stack
+ * of the main thread.
+ */
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ return NULL;
+}
+
+/*
+ * Verify that sigreturn is able to restore altstack even if the thread had
+ * disabled pkey 0.
+ */
+static void test_pkru_sigreturn(void)
+{
+ struct sigaction sa = {0};
+ static stack_t sigstack;
+ void *stack;
+ int pkey;
+ int parent_pid = 0;
+ int child_pid = 0;
+
+ sa.sa_handler = SIG_DFL;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+
+ /*
+ * For this testcase, we do not want to handle SIGSEGV. Reset handler
+ * to default so that the application can crash if it receives SIGSEGV.
+ */
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ sa.sa_sigaction = sigusr2_handler;
+ sigemptyset(&sa.sa_mask);
+
+ if (sigaction(SIGUSR2, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ assert(stack != MAP_FAILED);
+
+ /*
+ * Allow access to MPK 0 and MPK 2. The child thread (to be created
+ * later in this flow) will have its stack protected by MPK 2, whereas
+ * the current thread's stack is protected by the default MPK 0. Hence
+ * both need to be enabled.
+ */
+ __write_pkey_reg(0x55555544);
+
+ /* Protect the stack with MPK 2 */
+ pkey = pkey_alloc(0, 0);
+ pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);
+
+ /* Set up alternate signal stack that will use the default MPK */
+ sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ sigstack.ss_flags = 0;
+ sigstack.ss_size = STACK_SIZE;
+
+ /* Use clone to avoid newer glibcs using rseq on new threads */
+ long ret = syscall_raw(SYS_clone,
+ CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ (long) ((char *)(stack) + STACK_SIZE),
+ (long) &parent_pid,
+ (long) &child_pid, 0, 0);
+
+ if (ret < 0) {
+ errno = -ret;
+ perror("clone");
+ } else if (ret == 0) {
+ thread_sigusr2_self(&sigstack);
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ }
+
+ child_pid = ret;
+ /* Check that thread exited */
+ do {
+ sched_yield();
+ ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0);
+ } while (ret != -ESRCH && ret != -EINVAL);
+
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+static void (*pkey_tests[])(void) = {
+ test_sigsegv_handler_with_pkey0_disabled,
+ test_sigsegv_handler_cannot_access_stack,
+ test_sigsegv_handler_with_different_pkey_for_stack,
+ test_pkru_preserved_after_sigusr1,
+ test_pkru_sigreturn
+};
+
+int main(int argc, char *argv[])
+{
+ int i;
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(pkey_tests));
+
+ for (i = 0; i < ARRAY_SIZE(pkey_tests); i++)
+ (*pkey_tests[i])();
+
+ ksft_finished();
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
index 0789981b72b9..4990f7ab4cb7 100644
--- a/tools/testing/selftests/mm/protection_keys.c
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -954,16 +954,6 @@ void close_test_fds(void)
nr_test_fds = 0;
}
-#define barrier() __asm__ __volatile__("": : :"memory")
-__attribute__((noinline)) int read_ptr(int *ptr)
-{
- /*
- * Keep GCC from optimizing this away somehow
- */
- barrier();
- return *ptr;
-}
-
void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
{
int i, err;