Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--  arch/arm64/mm/Makefile      |  2
-rw-r--r--  arch/arm64/mm/contpte.c     |  6
-rw-r--r--  arch/arm64/mm/fault.c       | 55
-rw-r--r--  arch/arm64/mm/init.c        | 12
-rw-r--r--  arch/arm64/mm/ioremap.c     | 23
-rw-r--r--  arch/arm64/mm/mem_encrypt.c | 50
-rw-r--r--  arch/arm64/mm/mmap.c        | 11
-rw-r--r--  arch/arm64/mm/mmu.c         | 45
-rw-r--r--  arch/arm64/mm/proc.S        |  4
-rw-r--r--  arch/arm64/mm/ptdump.c      | 70
-rw-r--r--  arch/arm64/mm/trans_pgd.c   |  6
11 files changed, 224 insertions, 60 deletions
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 60454256945b..2fc8c6dd0407 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y		:= dma-mapping.o extable.o fault.o init.o \
 		   cache.o copypage.o flush.o \
-		   ioremap.o mmap.o pgd.o mmu.o \
+		   ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \
 		   context.o proc.o pageattr.o fixmap.o
 obj-$(CONFIG_ARM64_CONTPTE)	+= contpte.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index a3edced29ac1..55107d27d3f8 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -421,6 +421,12 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
 		ptep = contpte_align_down(ptep);
 		start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
 
+		/*
+		 * We are not advancing entry because __ptep_set_access_flags()
+		 * only consumes access flags from entry. And since we have checked
+		 * for the whole contpte block and returned early, pte_same()
+		 * within __ptep_set_access_flags() is likely false.
+		 */
 		for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
 			__ptep_set_access_flags(vma, addr, ptep, entry, 0);
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 451ba7cbd5ad..8b281cf308b3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -23,6 +23,7 @@
 #include <linux/sched/debug.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
+#include <linux/pkeys.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
 
@@ -486,6 +487,23 @@ static void do_bad_area(unsigned long far, unsigned long esr,
 	}
 }
 
+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!system_supports_poe())
+		return false;
+
+	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
+		return true;
+
+	return !arch_vma_access_permitted(vma,
+			mm_flags & FAULT_FLAG_WRITE,
+			mm_flags & FAULT_FLAG_INSTRUCTION,
+			false);
+}
+
 static bool is_el0_instruction_abort(unsigned long esr)
 {
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -511,6 +529,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	unsigned long addr = untagged_addr(far);
 	struct vm_area_struct *vma;
 	int si_code;
+	int pkey = -1;
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -575,6 +594,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 		goto bad_area;
 	}
+
+	if (fault_from_pkey(esr, vma, mm_flags)) {
+		pkey = vma_pkey(vma);
+		vma_end_read(vma);
+		fault = 0;
+		si_code = SEGV_PKUERR;
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto bad_area;
+	}
+
 	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
 	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 		vma_end_read(vma);
@@ -610,7 +639,16 @@ retry:
 		goto bad_area;
 	}
 
+	if (fault_from_pkey(esr, vma, mm_flags)) {
+		pkey = vma_pkey(vma);
+		mmap_read_unlock(mm);
+		fault = 0;
+		si_code = SEGV_PKUERR;
+		goto bad_area;
+	}
+
 	fault = handle_mm_fault(vma, addr, mm_flags, regs);
+
 	/* Quick path to respond to signals */
 	if (fault_signal_pending(fault, regs)) {
 		if (!user_mode(regs))
@@ -669,8 +707,23 @@ bad_area:
 
 		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
 	} else {
+		/*
+		 * The pkey value that we return to userspace can be different
+		 * from the pkey that caused the fault.
+		 *
+		 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+		 * 2. T1 : set POR_EL0 to deny access to pkey=4, touches, page
+		 * 3. T1 : faults...
+		 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+		 * 5. T1 : enters fault handler, takes mmap_lock, etc...
+		 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+		 *    faulted on a pte with its pkey=4.
+		 */
 		/* Something tried to access memory that out of memory map */
-		arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
+		if (si_code == SEGV_PKUERR)
+			arm64_force_sig_fault_pkey(far, inf->name, pkey);
+		else
+			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
 	}
 
 	return 0;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9b5ab6818f7f..a0400b9aa814 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -414,8 +414,16 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	free_reserved_area(lm_alias(__init_begin),
-			   lm_alias(__init_end),
+	void *lm_init_begin = lm_alias(__init_begin);
+	void *lm_init_end = lm_alias(__init_end);
+
+	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
+	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
+
+	/* Delete __init region from memblock.reserved. */
+	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
+
+	free_reserved_area(lm_init_begin, lm_init_end,
 			   POISON_FREE_INITMEM, "unused kernel");
 	/*
 	 * Unmap the __init region but leave the VM area in place. This
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 269f2f63ab7d..6cc0b7e7eb03 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -3,10 +3,22 @@
 #include <linux/mm.h>
 #include <linux/io.h>
 
+static ioremap_prot_hook_t ioremap_prot_hook;
+
+int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook)
+{
+	if (WARN_ON(ioremap_prot_hook))
+		return -EBUSY;
+
+	ioremap_prot_hook = hook;
+	return 0;
+}
+
 void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 			   unsigned long prot)
 {
 	unsigned long last_addr = phys_addr + size - 1;
+	pgprot_t pgprot = __pgprot(prot);
 
 	/* Don't allow outside PHYS_MASK */
 	if (last_addr & ~PHYS_MASK)
@@ -16,7 +28,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 	if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr))))
 		return NULL;
 
-	return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+	/*
+	 * If a hook is registered (e.g. for confidential computing
+	 * purposes), call that now and barf if it fails.
+	 */
+	if (unlikely(ioremap_prot_hook) &&
+	    WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) {
+		return NULL;
+	}
+
+	return generic_ioremap_prot(phys_addr, size, pgprot);
 }
 EXPORT_SYMBOL(ioremap_prot);
diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c
new file mode 100644
index 000000000000..ee3c0ab04384
--- /dev/null
+++ b/arch/arm64/mm/mem_encrypt.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of the memory encryption/decryption API.
+ *
+ * Since the low-level details of the operation depend on the
+ * Confidential Computing environment (e.g. pKVM, CCA, ...), this just
+ * acts as a top-level dispatcher to whatever hooks may have been
+ * registered.
+ *
+ * Author: Will Deacon <will@kernel.org>
+ * Copyright (C) 2024 Google LLC
+ *
+ * "Hello, boils and ghouls!"
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+
+#include <asm/mem_encrypt.h>
+
+static const struct arm64_mem_crypt_ops *crypt_ops;
+
+int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops)
+{
+	if (WARN_ON(crypt_ops))
+		return -EBUSY;
+
+	crypt_ops = ops;
+	return 0;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr)))
+		return 0;
+
+	return crypt_ops->encrypt(addr, numpages);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr)))
+		return 0;
+
+	return crypt_ops->decrypt(addr, numpages);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 642bdf908b22..7e3ad97e27d8 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -102,6 +102,17 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
 	if (vm_flags & VM_MTE)
 		prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
 
+#ifdef CONFIG_ARCH_HAS_PKEYS
+	if (system_supports_poe()) {
+		if (vm_flags & VM_PKEY_BIT0)
+			prot |= PTE_PO_IDX_0;
+		if (vm_flags & VM_PKEY_BIT1)
+			prot |= PTE_PO_IDX_1;
+		if (vm_flags & VM_PKEY_BIT2)
+			prot |= PTE_PO_IDX_2;
+	}
+#endif
+
 	return __pgprot(prot);
 }
 EXPORT_SYMBOL(vm_get_page_prot);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 353ea5dc32b8..e55b02fbddc8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -25,6 +25,7 @@
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
 #include <linux/kfence.h>
+#include <linux/pkeys.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -1549,3 +1550,47 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
 
 	cpu_uninstall_idmap();
 }
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
+{
+	u64 new_por = POE_RXW;
+	u64 old_por;
+	u64 pkey_shift;
+
+	if (!system_supports_poe())
+		return -ENOSPC;
+
+	/*
+	 * This code should only be called with valid 'pkey'
+	 * values originating from in-kernel users. Complain
+	 * if a bad value is observed.
+	 */
+	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+		return -EINVAL;
+
+	/* Set the bits we need in POR: */
+	new_por = POE_RXW;
+	if (init_val & PKEY_DISABLE_WRITE)
+		new_por &= ~POE_W;
+	if (init_val & PKEY_DISABLE_ACCESS)
+		new_por &= ~POE_RW;
+	if (init_val & PKEY_DISABLE_READ)
+		new_por &= ~POE_R;
+	if (init_val & PKEY_DISABLE_EXECUTE)
+		new_por &= ~POE_X;
+
+	/* Shift the bits in to the correct place in POR for pkey: */
+	pkey_shift = pkey * POR_BITS_PER_PKEY;
+	new_por <<= pkey_shift;
+
+	/* Get old POR and mask off any old bits in place: */
+	old_por = read_sysreg_s(SYS_POR_EL0);
+	old_por &= ~(POE_MASK << pkey_shift);
+
+	/* Write old part along with new part: */
+	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
+
+	return 0;
+}
+#endif
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index f4bc6c5bac06..8abdc7fed321 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -36,8 +36,6 @@
 #define TCR_KASLR_FLAGS	0
 #endif
 
-#define TCR_SMP_FLAGS	TCR_SHARED
-
 /* PTWs cacheable, inner/outer WBWA */
 #define TCR_CACHE_FLAGS	TCR_IRGN_WBWA | TCR_ORGN_WBWA
 
@@ -469,7 +467,7 @@ SYM_FUNC_START(__cpu_setup)
 	tcr	.req	x16
 	mov_q	mair, MAIR_EL1_SET
 	mov_q	tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \
-		     TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
+		     TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
 		     TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
 
 	tcr_clear_errata_bits tcr, x9, x5
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 6986827e0d64..264c5f9b97d8 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -38,33 +38,7 @@
 			seq_printf(m, fmt);	\
 })
 
-/*
- * The page dumper groups page table entries of the same type into a single
- * description. It uses pg_state to track the range information while
- * iterating over the pte entries. When the continuity is broken it then
- * dumps out a description of the range.
- */
-struct pg_state {
-	struct ptdump_state ptdump;
-	struct seq_file *seq;
-	const struct addr_marker *marker;
-	const struct mm_struct *mm;
-	unsigned long start_address;
-	int level;
-	u64 current_prot;
-	bool check_wx;
-	unsigned long wx_pages;
-	unsigned long uxn_pages;
-};
-
-struct prot_bits {
-	u64 mask;
-	u64 val;
-	const char *set;
-	const char *clear;
-};
-
-static const struct prot_bits pte_bits[] = {
+static const struct ptdump_prot_bits pte_bits[] = {
 	{
 		.mask	= PTE_VALID,
 		.val	= PTE_VALID,
@@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = {
 	}
 };
 
-struct pg_level {
-	const struct prot_bits *bits;
-	char name[4];
-	int num;
-	u64 mask;
-};
-
-static struct pg_level pg_level[] __ro_after_init = {
+static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = {
 	{ /* pgd */
 		.name	= "PGD",
 		.bits	= pte_bits,
@@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = {
 	},
 };
 
-static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
+static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits,
 		      size_t num)
 {
 	unsigned i;
@@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
 	}
 }
 
-static void note_prot_uxn(struct pg_state *st, unsigned long addr)
+static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr)
 {
 	if (!st->check_wx)
 		return;
@@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr)
 	st->uxn_pages += (addr - st->start_address) / PAGE_SIZE;
 }
 
-static void note_prot_wx(struct pg_state *st, unsigned long addr)
+static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
 {
 	if (!st->check_wx)
 		return;
@@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
 }
 
-static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
-		      u64 val)
+void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+	       u64 val)
 {
-	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+	struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
+	struct ptdump_pg_level *pg_level = st->pg_level;
 	static const char units[] = "KMGTPE";
 	u64 prot = 0;
 
 	/* check if the current level has been folded dynamically */
-	if ((level == 1 && mm_p4d_folded(st->mm)) ||
-	    (level == 2 && mm_pud_folded(st->mm)))
+	if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
+		       (level == 2 && mm_pud_folded(st->mm))))
 		level = 0;
 
 	if (level >= 0)
@@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 {
 	unsigned long end = ~0UL;
-	struct pg_state st;
+	struct ptdump_pg_state st;
 
 	if (info->base_addr < TASK_SIZE_64)
 		end = TASK_SIZE_64;
 
-	st = (struct pg_state){
+	st = (struct ptdump_pg_state){
 		.seq = s,
 		.marker = info->markers,
 		.mm = info->mm,
+		.pg_level = &kernel_pg_levels[0],
 		.level = -1,
 		.ptdump = {
 			.note_page = note_page,
@@ -312,10 +281,10 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 static void __init ptdump_initialize(void)
 {
 	unsigned i, j;
 
-	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
-		if (pg_level[i].bits)
-			for (j = 0; j < pg_level[i].num; j++)
-				pg_level[i].mask |= pg_level[i].bits[j].mask;
+	for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++)
+		if (kernel_pg_levels[i].bits)
+			for (j = 0; j < kernel_pg_levels[i].num; j++)
+				kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask;
 }
 
 static struct ptdump_info kernel_ptdump_info __ro_after_init = {
@@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = {
 
 bool ptdump_check_wx(void)
 {
-	struct pg_state st = {
+	struct ptdump_pg_state st = {
 		.seq = NULL,
 		.marker = (struct addr_marker[]) {
 			{ 0, NULL},
 			{ -1, NULL},
 		},
+		.pg_level = &kernel_pg_levels[0],
 		.level = -1,
 		.check_wx = true,
 		.ptdump = {
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 5139a28130c0..0f7b484cb2ff 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -42,14 +42,16 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
 		 * the temporary mappings we use during restore.
 		 */
 		__set_pte(dst_ptep, pte_mkwrite_novma(pte));
-	} else if ((debug_pagealloc_enabled() ||
-		   is_kfence_address((void *)addr)) && !pte_none(pte)) {
+	} else if (!pte_none(pte)) {
 		/*
 		 * debug_pagealloc will removed the PTE_VALID bit if
 		 * the page isn't in use by the resume kernel. It may have
 		 * been in use by the original kernel, in which case we need
 		 * to put it back in our copy to do the restore.
 		 *
+		 * Other cases include kfence / vmalloc / memfd_secret which
+		 * may call `set_direct_map_invalid_noflush()`.
+		 *
 		 * Before marking this entry valid, check the pfn should
 		 * be mapped.
 		 */
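
A few illustrative notes on how the hooks introduced above are intended to be consumed. None of the following code is part of the diff; it is a set of hedged sketches.

First, the ioremap_prot() hook added to ioremap.c is meant to be claimed once by a platform or Confidential Computing backend. The hook prototype below is inferred from the ioremap_prot_hook(phys_addr, size, &pgprot) call site; everything prefixed my_ is hypothetical.

/*
 * Hypothetical user of arm64_ioremap_prot_hook_register(). The hook type
 * is assumed to be int (*)(phys_addr_t, size_t, pgprot_t *), matching the
 * call site in ioremap_prot() above.
 */
#include <linux/init.h>
#include <linux/io.h>

static int my_ioremap_hook(phys_addr_t phys_addr, size_t size, pgprot_t *prot)
{
	/*
	 * A real backend would rewrite *prot here (e.g. to an attribute
	 * that keeps the mapping shared with the host) or return an error
	 * for ranges it cannot map; returning non-zero makes
	 * ioremap_prot() fail with a WARN.
	 */
	return 0;
}

static int __init my_ioremap_hook_init(void)
{
	return arm64_ioremap_prot_hook_register(my_ioremap_hook);
}
core_initcall(my_ioremap_hook_init);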
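
Second, the new mem_encrypt.c is only a dispatcher: a Confidential Computing backend registers an arm64_mem_crypt_ops once at boot, after which set_memory_encrypted()/set_memory_decrypted() route to it. The .encrypt/.decrypt callback shape below is inferred from the crypt_ops->encrypt(addr, numpages) calls in the dispatcher; the backend helpers are made up.

/*
 * Hypothetical backend registration against the dispatcher added in
 * arch/arm64/mm/mem_encrypt.c. my_hyp_set_range_shared() stands in for
 * whatever hypervisor call the real environment would make.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <asm/mem_encrypt.h>

static int my_hyp_set_range_shared(unsigned long addr, int numpages, bool shared)
{
	/* Real code would issue a hypercall per page (or per range) here. */
	return 0;
}

static int my_cc_encrypt(unsigned long addr, int numpages)
{
	/* Back to private (encrypted) memory. */
	return my_hyp_set_range_shared(addr, numpages, false);
}

static int my_cc_decrypt(unsigned long addr, int numpages)
{
	/* Shared (decrypted) with the host. */
	return my_hyp_set_range_shared(addr, numpages, true);
}

static const struct arm64_mem_crypt_ops my_cc_mem_crypt_ops = {
	.encrypt	= my_cc_encrypt,
	.decrypt	= my_cc_decrypt,
};

static int __init my_cc_mem_encrypt_init(void)
{
	/* A real backend would first probe for its environment. */
	return arm64_mem_crypt_ops_register(&my_cc_mem_crypt_ops);
}
arch_initcall(my_cc_mem_encrypt_init);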
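
Finally, the POE/pkey plumbing added to fault.c, mmap.c and mmu.c is exercised from userspace through the generic protection-keys syscalls. A minimal sketch, assuming a C library that provides the pkey_alloc()/pkey_mprotect() wrappers (glibc >= 2.27) and a kernel with pkey support on this CPU: a disallowed access then faults with SIGSEGV/SEGV_PKUERR, the si_code raised by the new fault_from_pkey() path.

/* Minimal userspace sketch of driving the pkey paths added above. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Allocate a key that denies all data access from this thread. */
	int pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);
	if (pkey < 0) {
		perror("pkey_alloc");	/* e.g. no pkey/POE support */
		return 1;
	}

	/* Tag the mapping with the key (sets the PTE_PO_IDX_* bits above). */
	if (pkey_mprotect(p, len, PROT_READ | PROT_WRITE, pkey)) {
		perror("pkey_mprotect");
		return 1;
	}

	p[0] = 1;	/* faults with SIGSEGV and si_code == SEGV_PKUERR */
	return 0;
}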