diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/mm/amdtopology.c | 34 | ||||
-rw-r--r-- | arch/x86/mm/debug_pagetables.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 28 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 78 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 62 | ||||
-rw-r--r-- | arch/x86/mm/ident_map.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 16 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/maccess.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt.c | 55 | ||||
-rw-r--r-- | arch/x86/mm/mem_encrypt_identity.c | 114 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 21 | ||||
-rw-r--r-- | arch/x86/mm/pat/memtype.c | 9 | ||||
-rw-r--r-- | arch/x86/mm/pat/set_memory.c | 57 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/pti.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 49 |
19 files changed, 305 insertions, 263 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c80febc44cd2..428048e73bd2 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -16,6 +16,7 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. KMSAN_SANITIZE_cpu_entry_area.o := n +KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg @@ -60,7 +61,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o +obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index b3ca7d23e4b0..9332b36a1091 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -54,13 +54,11 @@ static __init int find_northbridge(void) int __init amd_numa_init(void) { - u64 start = PFN_PHYS(0); + unsigned int numnodes, cores, apicid; + u64 prevbase, start = PFN_PHYS(0); u64 end = PFN_PHYS(max_pfn); - unsigned numnodes; - u64 prevbase; - int i, j, nb; u32 nodeid, reg; - unsigned int bits, cores, apicid_base; + int i, j, nb; if (!early_pci_allowed()) return -EINVAL; @@ -158,26 +156,18 @@ int __init amd_numa_init(void) return -ENOENT; /* - * We seem to have valid NUMA configuration. Map apicids to nodes - * using the coreid bits from early_identify_cpu. + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the size of the core domain in the APIC space. */ - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - apicid_base = 0; + cores = topology_get_domain_size(TOPO_CORE_DOMAIN); - /* - * get boot-time SMP configuration: - */ - early_get_smp_config(); + apicid = boot_cpu_physical_apicid; + if (apicid > 0) + pr_info("BSP APIC ID: %02x\n", apicid); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; + for_each_node_mask(i, numa_nodes_parsed) { + for (j = 0; j < cores; j++, apicid++) + set_apicid_to_node(apicid, i); } - - for_each_node_mask(i, numa_nodes_parsed) - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); - return 0; } diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b43301cb2a80..ae5c213a1cb0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -22,7 +22,7 @@ static int ptdump_curknl_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) @@ -54,7 +54,7 @@ static int __init pt_dump_debug_init(void) debugfs_create_file("current_kernel", 0400, dir, NULL, &ptdump_curknl_fops); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION debugfs_create_file("current_user", 0400, dir, NULL, &ptdump_curusr_fops); #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc2..89079ea73e65 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -static void ptdump_walk_pgd_level_core(struct seq_file *m, - struct mm_struct *mm, pgd_t *pgd, - bool checkwx, bool dmesg) +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg) { const struct ptdump_range ptdump_ranges[] = { #ifdef CONFIG_X86_64 @@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, ptdump_walk_pgd(&st.ptdump, mm, pgd); if (!checkwx) - return; - if (st.wx_pages) + return true; + if (st.wx_pages) { pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", st.wx_pages); - else + + return false; + } else { pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); + + return true; + } } void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) @@ -408,7 +413,7 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user) { pgd_t *pgd = mm->pgd; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif @@ -418,7 +423,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); void ptdump_walk_user_pgd_level_checkwx(void) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || @@ -431,9 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void) #endif } -void ptdump_walk_pgd_level_checkwx(void) +bool ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); + if (!(__supported_pte_mask & _PAGE_NX)) + return true; + + return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } static int __init pt_dump_init(void) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 271dcb2deabc..b522933bfa56 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -6,6 +6,7 @@ #include <xen/xen.h> #include <asm/fpu/api.h> +#include <asm/fred.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } +#ifdef CONFIG_X86_FRED +static bool ex_handler_eretu(const struct exception_table_entry *fixup, + struct pt_regs *regs, unsigned long error_code) +{ + struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); + unsigned short ss = uregs->ss; + unsigned short cs = uregs->cs; + + /* + * Move the NMI bit from the invalid stack frame, which caused ERETU + * to fault, to the fault handler's stack frame, thus to unblock NMI + * with the fault handler's ERETS instruction ASAP if NMI is blocked. + */ + regs->fred_ss.nmi = uregs->fred_ss.nmi; + + /* + * Sync event information to uregs, i.e., the ERETU return frame, but + * is it safe to write to the ERETU return frame which is just above + * current event stack frame? + * + * The RSP used by FRED to push a stack frame is not the value in %rsp, + * it is calculated from %rsp with the following 2 steps: + * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes + * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line + * when an event delivery doesn't trigger a stack level change. + * + * Here is an example with N*64 (N=1) bytes reserved: + * + * 64-byte cache line ==> ______________ + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETU return frame + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETS return frame + * + * Thus a new FRED stack frame will always be pushed below a previous + * FRED stack frame ((N*64) bytes may be reserved between), and it is + * safe to write to a previous FRED stack frame as they never overlap. + */ + fred_info(uregs)->edata = fred_event_data(regs); + uregs->ssx = regs->ssx; + uregs->fred_ss.ss = ss; + /* The NMI bit was moved away above */ + uregs->fred_ss.nmi = 0; + uregs->csx = regs->csx; + uregs->fred_cs.sl = 0; + uregs->fred_cs.wfe = 0; + uregs->cs = cs; + uregs->orig_ax = error_code; + + return ex_handler_default(fixup, regs); +} +#endif + int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); @@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); +#ifdef CONFIG_X86_FRED + case EX_TYPE_ERETU: + return ex_handler_eretu(e, regs, error_code); +#endif } BUG(); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..622d12ec7f08 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -34,6 +34,8 @@ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> +#include <asm/fred.h> +#include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -250,7 +252,7 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; - if (pmd_large(*pmd_k)) + if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); @@ -319,7 +321,7 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -368,7 +370,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); - if (!p4d_present(*p4d) || p4d_large(*p4d)) + if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); @@ -376,7 +378,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud) || pud_large(*pud)) + if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); @@ -384,7 +386,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -547,6 +549,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : + (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { @@ -579,6 +582,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad } dump_pagetable(address); + + if (error_code & X86_PF_RMP) + snp_dump_hva_rmpentry(address); } static noinline void @@ -798,15 +804,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } -/* - * The (legacy) vsyscall page is the long page in the kernel portion - * of the address space that has user-accessible permissions. - */ -static bool is_vsyscall_vaddr(unsigned long vaddr) -{ - return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); -} - static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) @@ -1039,21 +1036,21 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!p4d_present(*p4d)) return 0; - if (p4d_large(*p4d)) + if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); @@ -1302,21 +1299,14 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode(regs)) { - local_irq_enable(); - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + /* Legacy check - remove this after verifying that it doesn't trigger */ + if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { + bad_area_nosemaphore(regs, error_code, address); + return; } + local_irq_enable(); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1332,6 +1322,14 @@ void do_user_addr_fault(struct pt_regs *regs, if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + /* + * We set FAULT_FLAG_USER based on the register state, not + * based on X86_PF_USER. User space accesses that cause + * system page faults are still user accesses. + */ + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The @@ -1518,8 +1516,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index f50cc210a981..a204a332c71f 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -33,7 +33,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, next = end; /* if this is already a gbpage, this portion is already mapped */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) continue; /* Is using a gbpage allowed? */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b63403d7179d..ac41b1e0940d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -463,7 +463,7 @@ void __init native_pagetable_init(void) break; /* should not be large page here */ - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", pfn, pmd, __pa(pmd)); BUG_ON(1); @@ -800,6 +800,4 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); - if (__supported_pte_mask & _PAGE_NX) - debug_checkwx(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a0dffaca6d2b..7e177856ee4f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -530,7 +530,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, } if (!pmd_none(*pmd)) { - if (!pmd_large(*pmd)) { + if (!pmd_leaf(*pmd)) { spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = phys_pte_init(pte, paddr, @@ -617,7 +617,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, } if (!pud_none(*pud)) { - if (!pud_large(*pud)) { + if (!pud_leaf(*pud)) { pmd = pmd_offset(pud, 0); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, @@ -1114,7 +1114,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, if (!pmd_present(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) @@ -1163,7 +1163,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!pud_present(*pud)) continue; - if (pud_large(*pud) && + if (pud_leaf(*pud) && IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { spin_lock(&init_mm.page_table_lock); @@ -1197,7 +1197,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, if (!p4d_present(*p4d)) continue; - BUILD_BUG_ON(p4d_large(*p4d)); + BUILD_BUG_ON(p4d_leaf(*p4d)); pud_base = pud_offset(p4d, 0); remove_pud_table(pud_base, addr, next, altmap, direct); @@ -1412,8 +1412,6 @@ void mark_rodata_ro(void) (void *)text_end, (void *)rodata_start); free_kernel_image_pages("unused kernel image (rodata/data gap)", (void *)rodata_end, (void *)_sdata); - - debug_checkwx(); } /* @@ -1522,9 +1520,9 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { - int large = pmd_large(*pmd); + int large = pmd_leaf(*pmd); - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); vmemmap_use_sub_pmd(addr, next); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0302491d799d..9dddf19a5571 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -95,7 +95,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (!pmd_large(*pmd)) + if (!pmd_leaf(*pmd)) kasan_populate_pmd(pmd, addr, next, nid); } while (pmd++, addr = next, addr != end); } @@ -115,7 +115,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (!pud_large(*pud)) + if (!pud_leaf(*pud)) kasan_populate_pud(pud, addr, next, nid); } while (pud++, addr = next, addr != end); } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index 6993f026adec..42115ac079cf 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -3,6 +3,8 @@ #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/vsyscall.h> + #ifdef CONFIG_X86_64 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { @@ -16,6 +18,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) return false; /* + * Reading from the vsyscall page may cause an unhandled fault in + * certain cases. Though it is at an address above TASK_SIZE_MAX, it is + * usually considered as a user space address. + */ + if (is_vsyscall_vaddr(vaddr)) + return false; + + /* * Allow everything during early boot before 'x86_virt_bits' * is initialized. Needed for instruction decoding in early * exception handlers. diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index c290c55b632b..6f3b3e028718 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -14,6 +14,8 @@ #include <linux/mem_encrypt.h> #include <linux/virtio_anchor.h> +#include <asm/sev.h> + /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { @@ -42,38 +44,45 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("Memory Encryption Features active:"); - - if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { - pr_cont(" Intel TDX\n"); - return; - } + pr_info("Memory Encryption Features active: "); - pr_cont(" AMD"); + switch (cc_vendor) { + case CC_VENDOR_INTEL: + pr_cont("Intel TDX\n"); + break; + case CC_VENDOR_AMD: + pr_cont("AMD"); - /* Secure Memory Encryption */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + /* Secure Memory Encryption */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { /* * SME is mutually exclusive with any of the SEV * features below. - */ - pr_cont(" SME\n"); - return; - } + */ + pr_cont(" SME\n"); + return; + } - /* Secure Encrypted Virtualization */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - pr_cont(" SEV"); + /* Secure Encrypted Virtualization */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + pr_cont(" SEV-ES"); - /* Encrypted Register State */ - if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - pr_cont(" SEV-ES"); + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); - /* Secure Nested Paging */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - pr_cont(" SEV-SNP"); + pr_cont("\n"); - pr_cont("\n"); + sev_show_status(); + + break; + default: + pr_cont("Unknown\n"); + } } /* Architecture __weak replacement functions */ diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index d73aeb16417f..ac33b2263a43 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -41,9 +41,9 @@ #include <linux/mem_encrypt.h> #include <linux/cc_platform.h> +#include <asm/init.h> #include <asm/setup.h> #include <asm/sections.h> -#include <asm/cmdline.h> #include <asm/coco.h> #include <asm/sev.h> @@ -95,11 +95,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static char sme_cmdline_arg[] __initdata = "mem_encrypt"; -static char sme_cmdline_on[] __initdata = "on"; -static char sme_cmdline_off[] __initdata = "off"; - -static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -114,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -145,13 +141,13 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); } - if (pud_large(*pud)) + if (pud_leaf(*pud)) return NULL; return pud; } -static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -161,13 +157,13 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) return; pmd = pmd_offset(pud, ppd->vaddr); - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return; set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +181,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); } - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return; pte = pte_offset_kernel(pmd, ppd->vaddr); @@ -193,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -203,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -213,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -237,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __init sme_pgtable_calc(unsigned long len) +static unsigned long __head sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -289,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return entries + tables; } -void __init sme_encrypt_kernel(struct boot_params *bp) +void __head sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -305,7 +301,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * instrumentation or checking boot_cpu_data in the cc_platform_has() * function. */ - if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) + if (!sme_get_me_mask() || + RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED) return; /* @@ -323,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * memory from being cached. */ - /* Physical addresses gives us the identity mapped virtual addresses */ - kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE); + kernel_start = (unsigned long)RIP_REL_REF(_text); + kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE); kernel_len = kernel_end - kernel_start; initrd_start = 0; @@ -343,14 +339,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp) #endif /* - * We're running identity mapped, so we must obtain the address to the - * SME encryption workarea using rip-relative addressing. - */ - asm ("lea sme_workarea(%%rip), %0" - : "=r" (workarea_start) - : "p" (sme_workarea)); - - /* * Calculate required number of workarea bytes needed: * executable encryption area size: * stack page (PAGE_SIZE) @@ -359,7 +347,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * pagetable structures for the encryption of the kernel * pagetable structures for workarea (in case not currently mapped) */ - execute_start = workarea_start; + execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea); execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; execute_len = execute_end - execute_start; @@ -502,14 +490,11 @@ void __init sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __init sme_enable(struct boot_params *bp) +void __head sme_enable(struct boot_params *bp) { - const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; - bool active_by_default; unsigned long me_mask; - char buffer[16]; bool snp; u64 msr; @@ -543,15 +528,18 @@ void __init sme_enable(struct boot_params *bp) me_mask = 1UL << (ebx & 0x3f); /* Check the SEV MSR whether SEV or SME is enabled */ - sev_status = __rdmsr(MSR_AMD64_SEV); - feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); + feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ - if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED)) snp_abort(); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { + if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION)) + return; + /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a @@ -571,48 +559,10 @@ void __init sme_enable(struct boot_params *bp) msr = __rdmsr(MSR_AMD64_SYSCFG); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; - } else { - /* SEV state cannot be controlled by a command line option */ - sme_me_mask = me_mask; - goto out; } - /* - * Fixups have not been applied to phys_base yet and we're running - * identity mapped, so we must obtain the address to the SME command - * line argument data using rip-relative addressing. - */ - asm ("lea sme_cmdline_arg(%%rip), %0" - : "=r" (cmdline_arg) - : "p" (sme_cmdline_arg)); - asm ("lea sme_cmdline_on(%%rip), %0" - : "=r" (cmdline_on) - : "p" (sme_cmdline_on)); - asm ("lea sme_cmdline_off(%%rip), %0" - : "=r" (cmdline_off) - : "p" (sme_cmdline_off)); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; - - cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | - ((u64)bp->ext_cmd_line_ptr << 32)); - - if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) - return; - - if (!strncmp(buffer, cmdline_on, sizeof(buffer))) - sme_me_mask = me_mask; - else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) - sme_me_mask = 0; - else - sme_me_mask = active_by_default ? me_mask : 0; -out: - if (sme_me_mask) { - physical_mask &= ~sme_me_mask; - cc_vendor = CC_VENDOR_AMD; - cc_set_mask(sme_me_mask); - } + RIP_REL_REF(sme_me_mask) = me_mask; + physical_mask &= ~me_mask; + cc_vendor = CC_VENDOR_AMD; + cc_set_mask(me_mask); } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index adc497b93f03..65e9a6e391c0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b) const struct numa_memblk *ma = *(const struct numa_memblk **)a; const struct numa_memblk *mb = *(const struct numa_memblk **)b; - return ma->start - mb->start; + return (ma->start > mb->start) - (ma->start < mb->start); } static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; @@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; * @start: address to begin fill * @end: address to end fill * - * Find and extend numa_meminfo memblks to cover the @start-@end - * physical address range, such that the first memblk includes - * @start, the last memblk includes @end, and any gaps in between - * are filled. + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end * * RETURNS: * 0 : Success - * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end */ int __init numa_fill_memblks(u64 start, u64 end) @@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end) /* * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. Exclude (start == bi->end) since - * end addresses in both a CFMWS range and a memblk range - * are exclusive. - * - * This list of pointers is used to make in-place changes - * that fill out the numa_meminfo memblks. + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. */ for (int i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - if (start < bi->end && end >= bi->start) { + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { blk[count] = &mi->blk[i]; count++; } diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0904d7e8e126..0d72183b5dd0 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -240,6 +240,8 @@ void pat_cpu_init(void) } wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); + + __flush_tlb_all(); } /** @@ -296,13 +298,8 @@ void __init pat_bp_init(void) /* * Xen PV doesn't allow to set PAT MSR, but all cache modes are * supported. - * When running as TDX guest setting the PAT MSR won't work either - * due to the requirement to set CR0.CD when doing so. Rely on - * firmware to have set the PAT MSR correctly. */ - if (pat_disabled || - cpu_feature_enabled(X86_FEATURE_XENPV) || - cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) { init_cache_modes(pat_msr_val); return; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9b448d1b1b7..80c9037ffadf 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -676,7 +676,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_512G; - if (p4d_large(*p4d) || !p4d_present(*p4d)) + if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; pud = pud_offset(p4d, address); @@ -684,7 +684,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_1G; - if (pud_large(*pud) || !pud_present(*pud)) + if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; pmd = pmd_offset(pud, address); @@ -692,7 +692,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_2M; - if (pmd_large(*pmd) || !pmd_present(*pmd)) + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; @@ -739,11 +739,11 @@ pmd_t *lookup_pmd_address(unsigned long address) return NULL; p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); - if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); @@ -755,10 +755,14 @@ pmd_t *lookup_pmd_address(unsigned long address) * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * - * This could be optimized, but it is only intended to be - * used at initialization time, and keeping it - * unoptimized should increase the testing coverage for - * the more obscure platforms. + * Note that as long as the PTEs are well-formed with correct PFNs, this + * works without checking the PRESENT bit in the leaf PTE. This is unlike + * the similar vmalloc_to_page() and derivatives. Callers may depend on + * this behavior. + * + * This could be optimized, but it is only used in paths that are not perf + * sensitive, and keeping it unoptimized should increase the testing coverage + * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { @@ -1229,7 +1233,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); @@ -1274,7 +1278,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) */ while (end - start >= PUD_SIZE) { - if (pud_large(*pud)) + if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); @@ -2041,17 +2045,12 @@ int set_mce_nospec(unsigned long pfn) return rc; } -static int set_memory_p(unsigned long *addr, int numpages) -{ - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); -} - /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); - return set_memory_p(&addr, 1); + return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ @@ -2104,6 +2103,11 @@ int set_memory_np_noalias(unsigned long addr, int numpages) CPA_NO_CHECK_ALIAS, NULL); } +int set_memory_p(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), @@ -2153,7 +2157,7 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) /* Notify hypervisor that we are about to set/clr encryption attribute. */ if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) - return -EIO; + goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); @@ -2166,13 +2170,20 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + if (ret) + return ret; + /* Notify hypervisor that we have successfully set/clr encryption attribute. */ - if (!ret) { - if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) - ret = -EIO; - } + if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) + goto vmm_fail; - return ret; + return 0; + +vmm_fail: + WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n", + (void *)addr, numpages, enc ? "private" : "shared"); + + return -EIO; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0cbc1b8e8e3d..d007591b8059 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -293,7 +293,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; @@ -325,7 +325,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { @@ -777,7 +777,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) */ int pud_clear_huge(pud_t *pud) { - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } @@ -792,7 +792,7 @@ int pud_clear_huge(pud_t *pud) */ int pmd_clear_huge(pmd_t *pmd) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 669ba1c345b3..2e69abf4f852 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } - BUILD_BUG_ON(pgd_large(*pgd) != 0); + BUILD_BUG_ON(pgd_leaf(*pgd) != 0); return p4d_offset(pgd, address); } @@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!p4d) return NULL; - BUILD_BUG_ON(p4d_large(*p4d) != 0); + BUILD_BUG_ON(p4d_leaf(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) @@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } @@ -252,7 +252,7 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address) return NULL; /* We can't do anything sensible if we hit a large mapping. */ - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { WARN_ON(1); return NULL; } @@ -341,7 +341,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, continue; } - if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab..44ac64f3a047 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -89,10 +89,10 @@ #define CR3_HW_ASID_BITS 12 /* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define PTI_CONSUMED_PCID_BITS 1 #else # define PTI_CONSUMED_PCID_BITS 0 @@ -114,7 +114,7 @@ static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. @@ -149,7 +149,7 @@ static inline u16 kern_pcid(u16 asid) static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; @@ -262,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, static inline void invalidate_user_asid(u16 asid) { /* There is no user ASID if address space separation is off */ - if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) return; /* @@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, write_cr3(new_mm_cr3); } -void leave_mm(int cpu) +void leave_mm(void) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -327,7 +327,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, unsigned long flags; local_irq_save(flags); - switch_mm_irqs_off(prev, next, tsk); + switch_mm_irqs_off(NULL, next, tsk); local_irq_restore(flags); } @@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored) static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif -void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, +/* + * This optimizes when not actually switching mm's. Some architectures use the + * 'unused' argument for this optimization, but x86 must use + * 'cpu_tlbstate.loaded_mm' instead because it does not always keep + * 'current->active_mm' up to date. + */ +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); @@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, bool need_flush; u16 new_asid; - /* - * NB: The scheduler will call us with prev == next when switching - * from lazy TLB mode to normal mode if active_mm isn't changing. - * When this happens, we don't assume that CR3 (and hence - * cpu_tlbstate.loaded_mm) matches next. - * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill @@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * provides that full memory barrier and core serializing * instruction. */ - if (real_prev == next) { + if (prev == next) { /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); @@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(real_prev != &init_mm && + if (WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * Skip kernel threads; we never send init_mm TLB flushing IPIs, * but the bitmap manipulation can cause cache line contention. */ - if (real_prev != &init_mm) { + if (prev != &init_mm) { VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(real_prev))); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + mm_cpumask(prev))); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); } /* @@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - if (next != real_prev) { + if (next != prev) { cr4_update_pce_mm(next); - switch_ldt(real_prev, next); + switch_ldt(prev, next); } } |