diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index d52c3e2e16bc..47a397da0498 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -35,7 +35,7 @@ endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 286441cf3bf0..547614496e7e 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -16,7 +16,7 @@ struct machine_info { struct vmlinux_info { unsigned long default_lma; - void (*entry)(void); + unsigned long entry; unsigned long image_size; /* does not include .bss */ unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; @@ -27,6 +27,9 @@ struct vmlinux_info { unsigned long rela_dyn_start; unsigned long rela_dyn_end; unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; }; void startup_kernel(void); @@ -41,6 +44,7 @@ void print_missing_facilities(void); void sclp_early_setup_buffer(void); void print_pgm_check_info(void); unsigned long get_random_base(unsigned long safe_addr); +void setup_vmem(unsigned long online_end, unsigned long asce_limit); void __printf(1, 2) decompressor_printk(const char *fmt, ...); void error(char *m); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index da6ee587fe9a..c5d59df9fa62 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -166,9 +167,10 @@ static void setup_ident_map_size(unsigned 
long max_physmem_end) #endif } -static void setup_kernel_memory_layout(void) +static unsigned long setup_kernel_memory_layout(void) { unsigned long vmemmap_start; + unsigned long asce_limit; unsigned long rte_size; unsigned long pages; unsigned long vmax; @@ -183,10 +185,10 @@ static void setup_kernel_memory_layout(void) vmalloc_size > _REGION2_SIZE || vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) { - vmax = _REGION1_SIZE; + asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; } else { - vmax = _REGION2_SIZE; + asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } /* @@ -194,7 +196,7 @@ static void setup_kernel_memory_layout(void) * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. */ - vmax = adjust_to_uv_max(vmax); + vmax = adjust_to_uv_max(asce_limit); #ifdef CONFIG_KASAN /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); @@ -223,6 +225,8 @@ static void setup_kernel_memory_layout(void) /* make sure vmemmap doesn't overlay with vmalloc area */ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); vmemmap = (struct page *)vmemmap_start; + + return asce_limit; } /* @@ -256,6 +260,9 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.rela_dyn_start += offset; vmlinux.rela_dyn_end += offset; vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; } static unsigned long reserve_amode31(unsigned long safe_addr) @@ -268,7 +275,10 @@ void startup_kernel(void) { unsigned long random_lma; unsigned long safe_addr; + unsigned long asce_limit; + unsigned long online_end; void *img; + psw_t psw; detect_facilities(); @@ -290,7 +300,8 @@ void startup_kernel(void) sanitize_prot_virt_host(); setup_ident_map_size(detect_memory()); setup_vmalloc_size(); - setup_kernel_memory_layout(); + asce_limit = setup_kernel_memory_layout(); + online_end = 
min(get_mem_detect_end(), ident_map_size); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { random_lma = get_random_base(safe_addr); @@ -307,9 +318,23 @@ void startup_kernel(void) } else if (__kaslr_offset) memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e. init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able to use + * static memory references to data in .bss (i.e. init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ clear_bss_section(); - copy_bootdata(); handle_relocs(__kaslr_offset); + setup_vmem(online_end, asce_limit); + copy_bootdata(); if (__kaslr_offset) { /* @@ -321,5 +346,11 @@ void startup_kernel(void) if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) memset(img, 0, vmlinux.image_size); } - vmlinux.entry(); + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. 
+ */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..db1469c17289 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +unsigned long __bootdata_preserved(s390_invalid_asce); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); + +static void boot_check_oom(void) +{ + if (pgalloc_pos < pgalloc_low) + error("out of memory on boot\n"); +} + +static void pgtable_populate_begin(unsigned long online_end) +{ + unsigned long initrd_end; + unsigned long kernel_end; + + kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + pgalloc_low = round_up(kernel_end, PAGE_SIZE); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { + initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); + pgalloc_low = max(pgalloc_low, initrd_end); + } + + pgalloc_end = round_down(online_end, PAGE_SIZE); + pgalloc_pos = pgalloc_end; + + boot_check_oom(); +} + +static void *boot_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + pgalloc_pos -= size; + pgalloc_pos = round_down(pgalloc_pos, size); + + boot_check_oom(); + + return (void *)pgalloc_pos; +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = boot_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); + + if (!pte_leftover) { + pte_leftover = boot_alloc_pages(0); + 
pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + unsigned long next; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + entry = __pte(__pa(addr)); + entry = set_pte_bit(entry, PAGE_KERNEL_EXEC); + set_pte(pte, entry); + } + } +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(__pa(addr)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC); + set_pmd(pmd, entry); + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next); + } +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (can_large_pud(pud, addr, next)) { + entry = __pud(__pa(addr)); + entry = set_pud_bit(entry, REGION3_KERNEL_EXEC); + set_pud(pud, entry); + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, 
pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next); + } +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } + pgtable_p4d_populate(pgd, addr, next); + } +} + +/* + * The pgtables are located in the range [pgalloc_pos, pgalloc_end). + * That range must stay intact and is later reserved in the memblock. + * Therefore pgtable_populate(pgalloc_pos, pgalloc_end) is needed to + * finalize pgalloc_pos pointer. However that call can decrease the + * value of pgalloc_pos pointer itself. Therefore, pgtable_populate() + * needs to be called repeatedly until pgtables are complete and + * pgalloc_pos does not grow left anymore. 
+ */ +static void pgtable_populate_end(void) +{ + unsigned long pgalloc_end_curr = pgalloc_end; + unsigned long pgalloc_pos_prev; + + do { + pgalloc_pos_prev = pgalloc_pos; + pgtable_populate(pgalloc_pos, pgalloc_end_curr); + pgalloc_end_curr = pgalloc_pos_prev; + } while (pgalloc_pos < pgalloc_pos_prev); +} + +void setup_vmem(unsigned long online_end, unsigned long asce_limit) +{ + unsigned long asce_type; + unsigned long asce_bits; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + + /* + * To allow prefixing, the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. + * + * No further pgtable_populate() calls are allowed after the value + * of pgalloc_pos is finalized with a call to pgtable_populate_end(). 
+ */ + pgtable_populate_begin(online_end); + pgtable_populate(0, sizeof(struct lowcore)); + pgtable_populate(0, online_end); + pgtable_populate_end(); + + S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + + init_mm.context.asce = S390_lowcore.kernel_asce; +} diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 2768d5db181f..f7244cc16240 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -14,8 +14,6 @@ #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) extern void kasan_early_init(void); -extern void kasan_copy_shadow_mapping(void); -extern void kasan_free_early_identity(void); /* * Estimate kasan memory requirements, which it will reserve @@ -43,8 +41,6 @@ static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) } #else static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow_mapping(void) { } -static inline void kasan_free_early_identity(void) { } static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) { return 0; } #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0170f95f3b91..0f1eba005f6d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -23,6 +23,7 @@ #include extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); extern unsigned long s390_invalid_asce; diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 8bae33ab320a..bfb8c3cb8aee 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -26,7 +26,7 @@ #ifndef __ASSEMBLY__ #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | 
PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 77e6506898f5..6792ce28d37a 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -73,6 +73,9 @@ extern unsigned int zlib_dfltcc_support; extern int noexec_disabled; extern unsigned long ident_map_size; +extern unsigned long pgalloc_pos; +extern unsigned long pgalloc_end; +extern unsigned long pgalloc_low; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 9693c8630e73..9cfd9f4fc927 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -160,9 +161,7 @@ static noinline __init void setup_lowcore_early(void) psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - if (IS_ENABLED(CONFIG_KASAN)) - psw.mask |= PSW_MASK_DAT; + psw.mask = PSW_KERNEL_BITS; S390_lowcore.program_new_psw = psw; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4bf1ee293f2b..a8aebb5c95cf 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -51,8 +51,8 @@ void arch_cpu_idle(void) unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); /* psw_idle() returns with interrupts disabled. 
*/ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 3f5d2db0b854..67df64ef4839 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -147,8 +147,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; frame->childregs.gprs[9] = (unsigned long)args->fn; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2b6091349daa..1ffaa85cd518 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -149,6 +149,9 @@ int __bootdata(noexec_disabled); unsigned long __bootdata(ident_map_size); struct mem_detect_info __bootdata(mem_detect); struct initrd_data __bootdata(initrd_data); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata(__amode31_base); @@ -411,16 +414,12 @@ void __init arch_call_rest_init(void) call_on_stack_noreturn(rest_init, stack); } -static void __init setup_lowcore_dat_off(void) +static void __init setup_lowcore(void) { - unsigned long int_psw_mask = PSW_KERNEL_BITS; - struct lowcore *abs_lc, *lc; + struct lowcore *lc, *abs_lc; unsigned long mcck_stack; unsigned long flags; - if (IS_ENABLED(CONFIG_KASAN)) - int_psw_mask |= PSW_MASK_DAT; - /* * Setup lowcore for boot cpu */ @@ -430,17 +429,17 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) restart_int_handler; - 
lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; - lc->mcck_new_psw.mask = int_psw_mask; + lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; lc->nodat_stack = ((unsigned long) &init_thread_union) @@ -477,15 +476,7 @@ static void __init setup_lowcore_dat_off(void) lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; lc->restart_source = -1U; - - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_stack = lc->restart_stack; - abs_lc->restart_fn = lc->restart_fn; - abs_lc->restart_data = lc->restart_data; - abs_lc->restart_source = lc->restart_source; - abs_lc->restart_psw = lc->restart_psw; - abs_lc->mcesad = lc->mcesad; - put_abs_lowcore(abs_lc, flags); + __ctl_store(lc->cregs_save_area, 0, 15); mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); if (!mcck_stack) @@ -499,33 +490,26 @@ static void __init setup_lowcore_dat_off(void) lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + abs_lc = 
get_abs_lowcore(&flags); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + memcpy(abs_lc->cregs_save_area, lc->cregs_save_area, sizeof(abs_lc->cregs_save_area)); + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc, flags); set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - struct lowcore *abs_lc; - unsigned long flags; - - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.mcck_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); - __ctl_store(S390_lowcore.cregs_save_area, 0, 15); if (abs_lowcore_map(0, lowcore_ptr[0], true)) panic("Couldn't setup absolute lowcore"); abs_lowcore_mapped = true; - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_flags = RESTART_FLAG_CTLREGS; - abs_lc->program_new_psw = S390_lowcore.program_new_psw; - memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area, - sizeof(abs_lc->cregs_save_area)); - put_abs_lowcore(abs_lc, flags); } static struct resource code_resource = { @@ -649,6 +633,14 @@ static struct notifier_block kdump_mem_nb = { #endif +/* + * Reserve page tables created by decompressor + */ +static void __init reserve_pgtables(void) +{ + memblock_reserve(pgalloc_pos, pgalloc_end - pgalloc_pos); +} + /* * Reserve memory for kdump kernel to be loaded with kexec */ @@ -1004,6 +996,7 @@ void __init setup_arch(char **cmdline_p) setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ + reserve_pgtables(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); @@ -1038,7 
+1031,7 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); @@ -1050,7 +1043,7 @@ void __init setup_arch(char **cmdline_p) static_branch_enable(&cpu_has_bear); /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. */ paging_init(); memcpy_real_init(); @@ -1058,7 +1051,6 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); #ifdef CONFIG_CRASH_DUMP smp_save_dump_ipl_cpu(); #endif diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0031325ce4bc..24f19f10b237 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -327,7 +327,7 @@ static void pcpu_delegate(struct pcpu *pcpu, lc = lowcore_ptr[pcpu - pcpu_devices]; source_cpu = stap(); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, pcpu_delegate_fn *, func, void *, data); @@ -488,7 +488,7 @@ void smp_send_stop(void) int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 5ea3830af0cc..a965ddc34f43 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -213,6 +213,9 @@ SECTIONS QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) } :NONE /* Debugging sections. 
*/ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 30ab55f868f6..144447d5cb4c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -52,9 +52,9 @@ #include pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +unsigned long __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -93,37 +93,8 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); - sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 
a97b7981358e..801d81c189a7 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include #include @@ -15,16 +14,11 @@ static unsigned long segment_pos __initdata; static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; static bool has_edat __initdata; static bool has_nx __initdata; #define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - static void __init kasan_early_panic(const char *reason) { sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); @@ -229,29 +223,6 @@ static void __init kasan_early_pgtable_populate(unsigned long address, } } -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - static void __init kasan_early_detect_facilities(void) { if (test_facility(8)) { @@ -272,7 +243,6 @@ void __init kasan_early_init(void) p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); unsigned long untracked_end = MODULES_VADDR; unsigned long shadow_alloc_size; - unsigned long initrd_end; unsigned long memsize; kasan_early_detect_facilities(); @@ -298,36 +268,24 @@ void __init kasan_early_init(void) BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); 
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } if (pgalloc_low + shadow_alloc_size > memsize) kasan_early_panic("out of memory during initialisation\n"); if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); + segment_pos = round_down(pgalloc_pos, _SEGMENT_SIZE); segment_low = segment_pos - shadow_alloc_size; + segment_low = round_down(segment_low, _SEGMENT_SIZE); pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; } - init_mm.pgd = early_pg_dir; /* * Current memory layout: * +- 0 -------------+ +- shadow start -+ @@ -376,40 +334,7 @@ void __init kasan_early_init(void) POPULATE_ZERO_SHADOW); kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); /* enable kasan */ init_task.kasan_depth = 0; - 
memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); sclp_early_printk("KernelAddressSanitizer initialized\n"); } - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ee1a97078527..78d7768f93d7 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -657,6 +658,29 @@ void vmem_unmap_4k_page(unsigned long addr) mutex_unlock(&vmem_mutex); } +static int __init memblock_region_cmp(const void *a, const void *b) +{ + const struct memblock_region *r1 = a; + const struct memblock_region *r2 = b; + + if (r1->base < r2->base) + return -1; + if (r1->base > r2->base) + return 1; + return 0; +} + +static void __init memblock_region_swap(void *a, void *b, int size) +{ + struct memblock_region *r1 = a; + struct memblock_region *r2 = b; + struct memblock_region swap; + + swap = *r1; + *r1 = *r2; + *r2 = swap; +} + /* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug @@ -664,11 +688,68 @@ void vmem_unmap_4k_page(unsigned long addr) */ void __init vmem_map_init(void) { + struct memblock_region 
memory_rwx_regions[] = { + { + .base = 0, + .size = sizeof(struct lowcore), + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_stext), + .size = _etext - _stext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_sinittext), + .size = _einittext - _sinittext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __stext_amode31, + .size = __etext_amode31 - __stext_amode31, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + }; + struct memblock_type memory_rwx = { + .regions = memory_rwx_regions, + .cnt = ARRAY_SIZE(memory_rwx_regions), + .max = ARRAY_SIZE(memory_rwx_regions), + }; phys_addr_t base, end; u64 i; - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); + /* + * Set RW+NX attribute on all memory, except regions enumerated with + * memory_rwx exclude type. These regions need different attributes, + * which are enforced afterwards. + * + * __for_each_mem_range() iterate and exclude types should be sorted. + * The relative location of _stext and _sinittext is hardcoded in the + * linker script. However a location of __stext_amode31 and the kernel + * image itself are chosen dynamically. Thus, sort the exclude type. 
+ */ + sort(&memory_rwx_regions, + ARRAY_SIZE(memory_rwx_regions), sizeof(memory_rwx_regions[0]), + memblock_region_cmp, memblock_region_swap); + __for_each_mem_range(i, &memblock.memory, &memory_rwx, + NUMA_NO_NODE, MEMBLOCK_NONE, &base, &end, NULL) { + __set_memory((unsigned long)__va(base), + (end - base) >> PAGE_SHIFT, + SET_MEMORY_RW | SET_MEMORY_NX); + } + __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); @@ -678,15 +759,14 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, + __set_memory(__stext_amode31, + (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - /* lowcore requires 4k mapping for real addresses / prefixing */ - set_memory_4k(0, LC_PAGES); - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) - set_memory_x(0, 1); + if (static_key_enabled(&cpu_has_bear)) + set_memory_nx(0, 1); + set_memory_nx(PAGE_SIZE, 1); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10);