aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/cpu_entry_area.c3
-rw-r--r--arch/x86/mm/extable.c25
-rw-r--r--arch/x86/mm/fault.c93
-rw-r--r--arch/x86/mm/init_64.c20
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/mem_encrypt.c38
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c3
-rw-r--r--arch/x86/mm/numa.c34
-rw-r--r--arch/x86/mm/numa_emulation.c5
-rw-r--r--arch/x86/mm/pat/set_memory.c2
-rw-r--r--arch/x86/mm/tlb.c37
11 files changed, 190 insertions, 72 deletions
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 770b613790b3..f5e1e60c9095 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
#endif
-struct cpu_entry_area *get_cpu_entry_area(int cpu)
+/* Is called from entry code, so must be noinstr */
+noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 1d6cb07f4f86..b93d6cd08a7f 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -5,6 +5,7 @@
#include <xen/xen.h>
#include <asm/fpu/internal.h>
+#include <asm/sev-es.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
@@ -80,6 +81,18 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_uaccess);
+__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
+{
+ WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
+ regs->ip = ex_fixup_addr(fixup);
+ regs->ax = trapnr;
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_copy);
+
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
@@ -125,17 +138,21 @@ __visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_clear_fs);
-__visible bool ex_has_fault_handler(unsigned long ip)
+enum handler_type ex_get_fault_handler_type(unsigned long ip)
{
const struct exception_table_entry *e;
ex_handler_t handler;
e = search_exception_tables(ip);
if (!e)
- return false;
+ return EX_HANDLER_NONE;
handler = ex_fixup_handler(e);
-
- return handler == ex_handler_fault;
+ if (handler == ex_handler_fault)
+ return EX_HANDLER_FAULT;
+ else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
+ return EX_HANDLER_UACCESS;
+ else
+ return EX_HANDLER_OTHER;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 35f1498e9832..82bf37a5c9ec 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3_pa();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+
+ if (pmd_large(*pmd_k))
+ return 0;
+
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+
+ return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -1081,7 +1128,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
return 0;
}
-static int fault_in_kernel_space(unsigned long address)
+bool fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
@@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+#ifdef CONFIG_X86_32
+ /*
+ * We can fault-in kernel-space virtual memory on-demand. The
+ * 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
+ *
+ * Before doing this on-demand faulting, ensure that the
+ * fault is not any of the following:
+ * 1. A fault on a PTE with a reserved bit set.
+ * 2. A fault caused by a user-mode access. (Do not demand-
+ * fault kernel memory due to user-mode accesses).
+ * 3. A fault caused by a page-level protection violation.
+ * (A demand fault would be on a non-present page which
+ * would have X86_PF_PROT==0).
+ *
+ * This is only needed to close a race condition on x86-32 in
+ * the vmalloc mapping/unmapping code. See the comment above
+ * vmalloc_fault() for details. On x86-64 the race does not
+ * exist as the vmalloc mappings don't need to be synchronized
+ * there.
+ */
+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+ }
+#endif
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1368,11 +1446,14 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
prefetchw(&current->mm->mmap_lock);
/*
- * KVM has two types of events that are, logically, interrupts, but
- * are unfortunately delivered using the #PF vector. These events are
- * "you just accessed valid memory, but the host doesn't have it right
- * now, so I'll put you to sleep if you continue" and "that memory
- * you tried to access earlier is available now."
+ * KVM uses #PF vector to deliver 'page not present' events to guests
+ * (asynchronous page fault mechanism). The event happens when a
+ * userspace task is trying to access some valid (from guest's point of
+ * view) memory which is not currently mapped by the host (e.g. the
+ * memory is swapped out). Note, the corresponding "page ready" event
+ * which is injected when the memory becomes available, is delived via
+ * an interrupt mechanism and not a #PF exception
+ * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
*
* We are relying on the interrupted context being sane (valid RSP,
* relevant locks not held, etc.), which is fine as long as the
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a4ac13cc3fdc..b5a3fa4033d3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -217,11 +217,6 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
sync_global_pgds_l4(start, end);
}
-void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
-{
- sync_global_pgds(start, end);
-}
-
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -1257,14 +1252,19 @@ static void __init preallocate_vmalloc_pages(void)
if (!p4d)
goto failed;
- /*
- * With 5-level paging the P4D level is not folded. So the PGDs
- * are now populated and there is no need to walk down to the
- * PUD level.
- */
if (pgtable_l5_enabled())
continue;
+ /*
+ * The goal here is to allocate all possibly required
+ * hardware page tables pointed to by the top hardware
+ * level.
+ *
+ * On 4-level systems, the P4D layer is folded away and
+ * the above code does no preallocation. Below, go down
+ * to the pud _software_ level to ensure the second
+ * hardware level is allocated on 4-level systems too.
+ */
lvl = "pud";
pud = pud_alloc(&init_mm, p4d, addr);
if (!pud)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 84d85dbd1dad..9e5ccc56f8e0 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -574,7 +574,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
/* For SEV, these areas are encrypted */
if (sev_active())
break;
- /* Fallthrough */
+ fallthrough;
case E820_TYPE_PRAM:
return true;
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9f1177edc2e7..ebb7edc8bc0a 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -38,6 +38,7 @@
* section is later cleared.
*/
u64 sme_me_mask __section(.data) = 0;
+u64 sev_status __section(.data) = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
@@ -347,7 +348,13 @@ bool sme_active(void)
bool sev_active(void)
{
- return sme_me_mask && sev_enabled;
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+}
+
+/* Needs to be called from non-instrumentable code */
+bool noinstr sev_es_active(void)
+{
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
@@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void)
free_init_pages("unused decrypted", vaddr, vaddr_end);
}
+static void print_mem_encrypt_feature_info(void)
+{
+ pr_info("AMD Memory Encryption Features active:");
+
+ /* Secure Memory Encryption */
+ if (sme_active()) {
+ /*
+ * SME is mutually exclusive with any of the SEV
+ * features below.
+ */
+ pr_cont(" SME\n");
+ return;
+ }
+
+ /* Secure Encrypted Virtualization */
+ if (sev_active())
+ pr_cont(" SEV");
+
+ /* Encrypted Register State */
+ if (sev_es_active())
+ pr_cont(" SEV-ES");
+
+ pr_cont("\n");
+}
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
@@ -415,8 +447,6 @@ void __init mem_encrypt_init(void)
if (sev_active())
static_branch_enable(&sev_enable_key);
- pr_info("AMD %s active\n",
- sev_active() ? "Secure Encrypted Virtualization (SEV)"
- : "Secure Memory Encryption (SME)");
+ print_mem_encrypt_feature_info();
}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index e2b0e2ac07bb..68d75379e06a 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp)
if (!(msr & MSR_AMD64_SEV_ENABLED))
return;
+ /* Save SEV_STATUS to avoid reading MSR again */
+ sev_status = msr;
+
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa76ec2d359b..44148691d78b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -37,14 +37,12 @@ static __init int numa_setup(char *opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
- numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+ return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
+ disable_srat();
+ if (!strncmp(opt, "nohmat", 6))
+ disable_hmat();
return 0;
}
early_param("numa", numa_setup);
@@ -516,7 +514,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
* memory ranges, because quirks such as trim_snb_memory()
* reserve specific pages for Sandy Bridge graphics. ]
*/
- for_each_memblock(reserved, mb_region) {
+ for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
if (nid != MAX_NUMNODES)
@@ -748,6 +746,27 @@ static void __init init_memory_less_node(int nid)
}
/*
+ * A node may exist which has one or more Generic Initiators but no CPUs and no
+ * memory.
+ *
+ * This function must be called after init_cpu_to_node(), to ensure that any
+ * memoryless CPU nodes have already been brought online, and before the
+ * node_data[nid] is needed for zone list setup in build_all_zonelists().
+ *
+ * When this function is called, any nodes containing either memory and/or CPUs
+ * will already be online and there is no need to do anything extra, even if
+ * they also contain one or more Generic Initiators.
+ */
+void __init init_gi_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_GENERIC_INITIATOR)
+ if (!node_online(nid))
+ init_memory_less_node(nid);
+}
+
+/*
* Setup early cpu_to_node.
*
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
@@ -919,7 +938,6 @@ int phys_to_target_node(phys_addr_t start)
return meminfo_to_nid(&numa_reserved_meminfo, start);
}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
int memory_add_physaddr_to_nid(u64 start)
{
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index c5174b4e318b..87d77cc52f86 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -13,9 +13,10 @@
static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
-void __init numa_emu_cmdline(char *str)
+int __init numa_emu_cmdline(char *str)
{
emu_cmdline = str;
+ return 0;
}
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
@@ -321,7 +322,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
u64 addr, u64 max_addr, u64 size)
{
return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
- 0, NULL, NUMA_NO_NODE);
+ 0, NULL, 0);
}
static int __init setup_emu2phys_nid(int *dfl_phys_nid)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d1b2a889f035..40baa90e74f4 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- cpa_flush(&cpa, 1);
+ cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
ret = __change_page_attr_set_clr(&cpa, 1);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1a3569b43aa5..11666ba19b62 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,7 +14,6 @@
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
-#include <asm/uv/uv.h>
#include "mm_internal.h"
@@ -555,21 +554,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/* Make sure we write CR3 before loaded_mm. */
@@ -809,29 +799,6 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
- if (is_uv_system()) {
- /*
- * This whole special case is confused. UV has a "Broadcast
- * Assist Unit", which seems to be a fancy way to send IPIs.
- * Back when x86 used an explicit TLB flush IPI, UV was
- * optimized to use its own mechanism. These days, x86 uses
- * smp_call_function_many(), but UV still uses a manual IPI,
- * and that IPI's action is out of date -- it does a manual
- * flush instead of calling flush_tlb_func_remote(). This
- * means that the percpu tlb_gen variables won't be updated
- * and we'll do pointless flushes on future context switches.
- *
- * Rather than hooking native_flush_tlb_others() here, I think
- * that UV should be updated so that smp_call_function_many(),
- * etc, are optimal on UV.
- */
- cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
-
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves