diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-17 15:03:01 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-17 15:03:01 +0200 |
commit | 70f43ea3a360c5a7d3474b0cfbabb80be6424596 (patch) | |
tree | 0c4b3ac2078294f9912e7cb0f44ff92f0cc9d596 /arch/x86/mm | |
parent | b136021126b99072da705f693a8be07c6285e47c (diff) | |
parent | 50c6dbdfd16e312382842198a7919341ad480e05 (diff) |
Merge tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 memory management updates from Thomas Gleixner:
- Make LAM enablement safe vs. kernel threads using a process mm
temporarily as switching back to the process would not update CR3 and
therefore not enable LAM causing faults in user space when using
tagged pointers. Cure it by synchronizing LAM enablement via IPIs to
all CPUs which use the related mm.
- Cure a LAM harmless inconsistency between CR3 and the state during
context switch. It's both confusing and prone to lead to real bugs
- Handle alt stack handling for threads which run with a non-zero
protection key. The non-zero key prevents the kernel to access the
alternate stack. Cure it by temporarily enabling all protection keys
for the alternate stack setup/restore operations.
- Provide a EFI config table identity mapping for kexec kernel to
prevent kexec fails because the new kernel cannot access the config
table array
- Use GB pages only when a full GB is mapped in the identity map as
otherwise the CPU can speculate into reserved areas after the end of
memory which causes malfunction on UV systems.
- Remove the noisy and pointless SRAT table dump during boot
- Use is_ioremap_addr() for iounmap() address range checks instead of
high_memory. is_ioremap_addr() is more precise.
* tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/ioremap: Improve iounmap() address range checks
x86/mm: Remove duplicate check from build_cr3()
x86/mm: Remove unused NX related declarations
x86/mm: Remove unused CR3_HW_ASID_BITS
x86/mm: Don't print out SRAT table information
x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
x86/kexec: Add EFI config table identity mapping for kexec kernel
selftests/mm: Add new testcases for pkeys
x86/pkeys: Restore altstack access in sigreturn()
x86/pkeys: Update PKRU to enable all pkeys before XSAVE
x86/pkeys: Add helper functions to update PKRU on the sigframe
x86/pkeys: Add PKRU as a parameter in signal handling functions
x86/mm: Cleanup prctl_enable_tagged_addr() nr_bits error checking
x86/mm: Fix LAM inconsistency during context switch
x86/mm: Use IPIs to synchronize LAM enablement
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/ident_map.c | 23 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 19 |
4 files changed, 30 insertions, 21 deletions
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index c45127265f2f..437e96fb4977 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -99,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - if (info->direct_gbpages) { - pud_t pudval; + /* if this is already a gbpage, this portion is already mapped */ + if (pud_leaf(*pud)) + continue; + + /* Is using a gbpage allowed? */ + use_gbpage = info->direct_gbpages; - if (pud_present(*pud)) - continue; + /* Don't use gbpage if it maps more than the requested region. */ + /* at the begining: */ + use_gbpage &= ((addr & ~PUD_MASK) == 0); + /* ... or at the end: */ + use_gbpage &= ((next & ~PUD_MASK) == 0); + + /* Never overwrite existing mappings */ + use_gbpage &= !pud_present(*pud); + + if (use_gbpage) { + pud_t pudval; - addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index aa7d279321ea..70b02fc61d93 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/ioport.h> +#include <linux/ioremap.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> @@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr) { struct vm_struct *p, *o; - if ((void __force *)addr <= high_memory) + if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr))) return; /* diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9c52a95937ad..6f8e0f21c710 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ @@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } int __init x86_acpi_numa_init(void) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 44ac64f3a047..86593d1b787d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -11,6 +11,7 @@ #include <linux/sched/smt.h> #include <linux/task_work.h> #include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -85,9 +86,6 @@ * */ -/* There are 12 bits of space for ASIDS in CR3 */ -#define CR3_HW_ASID_BITS 12 - /* * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches @@ -160,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) unsigned long cr3 = __sme_pa(pgd) | lam; if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); cr3 |= kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); @@ -503,9 +500,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, { struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); + unsigned long new_lam; u64 next_tlb_gen; bool need_flush; u16 new_asid; @@ -619,9 +616,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cpumask_clear_cpu(cpu, mm_cpumask(prev)); } - /* - * Start remote flushes and then read tlb_gen. - */ + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); @@ -633,7 +628,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, barrier(); } - set_tlbstate_lam_mode(next); + new_lam = mm_lam_cr3_mask(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); @@ -652,6 +647,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { cr4_update_pce_mm(next); @@ -698,6 +694,7 @@ void initialize_tlbstate_and_flush(void) int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long lam = mm_lam_cr3_mask(mm); unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. */ @@ -705,7 +702,7 @@ void initialize_tlbstate_and_flush(void) /* LAM expected to be disabled */ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); - WARN_ON(mm_lam_cr3_mask(mm)); + WARN_ON(lam); /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization @@ -724,7 +721,7 @@ void initialize_tlbstate_and_flush(void) this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); - set_tlbstate_lam_mode(mm); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); |