47 files changed, 1681 insertions, 1722 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index e1f095884386..27e9e90a8d35 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 # Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
 KCOV_INSTRUMENT_tlb.o		:= n
 KCOV_INSTRUMENT_mem_encrypt.o	:= n
@@ -9,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg
 endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
@@ -28,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP)	+= debug_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
-obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
-
 KASAN_SANITIZE_kasan_init_$(BITS).o := n
 obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
 
@@ -42,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b2da3b..048c761d97b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * AMD NUMA support.
  * Discover the memory map and associated nodes.
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000000..b9283cc27622
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+	unsigned long va = (unsigned long) cea_vaddr;
+
+	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+	extern char _entry_trampoline[];
+
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+	/*
+	 * On native 32-bit systems, the GDT cannot be read-only because
+	 * our double fault handler uses a task gate, and entering through
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
+	 *
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
+	 */
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+		    gdt_prot);
+
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+			     per_cpu_ptr(&entry_stack_storage, cpu), 1,
+			     PAGE_KERNEL);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+			     &per_cpu(cpu_tss_rw, cpu),
+			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+			     &per_cpu(exception_stacks, cpu),
+			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+	percpu_setup_debug_store(cpu);
+}
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+	unsigned long start, end;
+
+	BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+	start = CPU_ENTRY_AREA_BASE;
+	end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+	/* Careful here: start + PMD_SIZE might wrap around */
+	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
+		populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	setup_cpu_entry_area_ptes();
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
+}
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }
 
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
 
 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;
 
+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
+		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }
 
 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }
 
 module_init(pt_dump_debug_init);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e3ac6fe6c9e..2a4849e92831 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -44,68 +44,97 @@ struct addr_marker {
 	unsigned long max_lines;
 };
 
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space markers hints */
+
+#ifdef CONFIG_X86_64
+
 enum address_markers_idx {
 	USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
-# ifdef CONFIG_X86_ESPFIX64
+	CPU_ENTRY_AREA_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
+#ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+	EFI_END_NR,
+#endif
 	HIGH_KERNEL_NR,
 	MODULES_VADDR_NR,
 	MODULES_END_NR,
-#else
+	FIXADDR_START_NR,
+	END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+	[USER_SPACE_NR]		= { 0,			"User Space" },
+	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
+	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
+	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
+	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
+#ifdef CONFIG_KASAN
+	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
+	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
+#endif
+	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
+#endif
+	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
+	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
+	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
+	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
+	[END_OF_SPACE_NR]	= { -1,			NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	VMALLOC_START_NR,
 	VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
 	PKMAP_BASE_NR,
-# endif
-	FIXADDR_START_NR,
 #endif
+	CPU_ENTRY_AREA_NR,
+	FIXADDR_START_NR,
+	END_OF_SPACE_NR,
 };
 
-/* Address space markers hints */
 static struct addr_marker address_markers[] = {
-	{ 0, "User Space" },
-#ifdef CONFIG_X86_64
-	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
-	{ 0/* VMALLOC_START */, "vmalloc() Area" },
-	{ 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
-	{ KASAN_SHADOW_START,	"KASAN shadow" },
-	{ KASAN_SHADOW_END,	"KASAN shadow end" },
+	[USER_SPACE_NR]		= { 0,			"User Space" },
+	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
+	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
+	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
 #endif
-# ifdef CONFIG_X86_ESPFIX64
-	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
-	{ EFI_VA_END,		"EFI Runtime Services" },
-# endif
-	{ __START_KERNEL_map,   "High Kernel Mapping" },
-	{ MODULES_VADDR,        "Modules" },
-	{ MODULES_END,          "End Modules" },
-#else
-	{ PAGE_OFFSET,          "Kernel Mapping" },
-	{ 0/* VMALLOC_START */, "vmalloc() Area" },
-	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
-	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
-# endif
-	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
-#endif
-	{ -1, NULL }		/* End of list */
+	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
+	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
+	[END_OF_SPACE_NR]	= { -1,			NULL }
 };
 
+#endif /* !CONFIG_X86_64 */
+
 /* Multipliers for offsets within the PTEs */
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 	static const char * const level_name[] =
 		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
-	if (!pgprot_val(prot)) {
+	if (!(pr & _PAGE_PRESENT)) {
 		/* Not present */
 		pt_dump_cont_printf(m, dmsg, "                              ");
 	} else {
@@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}
 
 	st.check_wx = checkwx;
@@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
 
 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }
 
 static int __init pt_dump_init(void)
@@ -525,8 +578,8 @@ static int __init pt_dump_init(void)
 	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
 # endif
 	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
 #endif
-
 	return 0;
 }
 __initcall(pt_dump_init);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index c3521e2be396..9fe656c42aa5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,6 +1,7 @@
 #include <linux/extable.h>
 #include <linux/uaccess.h>
 #include <linux/sched/debug.h>
+#include <xen/xen.h>
 
 #include <asm/fpu/internal.h>
 #include <asm/traps.h>
@@ -67,17 +68,22 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
 	 * wrapped around) will be set. Additionally, seeing the refcount
 	 * reach 0 will set ZF (Zero Flag: result was zero). In each of
 	 * these cases we want a report, since it's a boundary condition.
-	 *
+	 * The SF case is not reported since it indicates post-boundary
+	 * manipulations below zero or above INT_MAX. And if none of the
+	 * flags are set, something has gone very wrong, so report it.
 	 */
 	if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
 		bool zero = regs->flags & X86_EFLAGS_ZF;
 
 		refcount_error_report(regs, zero ? "hit zero" : "overflow");
+	} else if ((regs->flags & X86_EFLAGS_SF) == 0) {
+		/* Report if none of OF, ZF, nor SF are set. */
+		refcount_error_report(regs, "unexpected saturation");
 	}
 
 	return true;
 }
-EXPORT_SYMBOL_GPL(ex_handler_refcount);
+EXPORT_SYMBOL(ex_handler_refcount);
 
 /*
  * Handler for when we fail to restore a task's FPU state.  We should never get
@@ -207,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
 	 * Old CPUs leave the high bits of CS on the stack
 	 * undefined.  I'm not sure which CPUs do this, but at least
 	 * the 486 DX works this way.
+	 * Xen pv domains are not using the default __KERNEL_CS.
 	 */
-	if (regs->cs != __KERNEL_CS)
+	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
 		goto fail;
 
 	/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e2baeaa053a5..06fe3d51d385 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Copyright (C) 1995  Linus Torvalds
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
@@ -19,7 +20,6 @@
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
-#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
 #include <asm/vm86.h>			/* struct vm86			*/
@@ -29,26 +29,6 @@
 #include <asm/trace/exceptions.h>
 
 /*
- * Page fault error code bits:
- *
- *   bit 0 ==	 0: no page found	1: protection fault
- *   bit 1 ==	 0: read access		1: write access
- *   bit 2 ==	 0: kernel-mode access	1: user-mode access
- *   bit 3 ==				1: use of reserved bit detected
- *   bit 4 ==				1: fault was an instruction fetch
- *   bit 5 ==				1: protection keys block access
- */
-enum x86_pf_error_code {
-
-	PF_PROT		=		1 << 0,
-	PF_WRITE	=		1 << 1,
-	PF_USER		=		1 << 2,
-	PF_RSVD		=		1 << 3,
-	PF_INSTR	=		1 << 4,
-	PF_PK		=		1 << 5,
-};
-
-/*
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 	 * If it was a exec (instruction fetch) fault on NX page, then
 	 * do not ignore the fault:
 	 */
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		return 0;
 
 	instr = (void *)convert_ip_to_linear(current, regs);
@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * siginfo so userspace can discover which protection key was set
  * on the PTE.
  *
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
  * fault and that there was a VMA once we got in the fault
  * handler.  It does *not* guarantee that the VMA we find here
  * was the one that we faulted on.
@@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 	/*
 	 * force_sig_info_fault() is called from a number of
 	 * contexts, some of which have a VMA and some of which
-	 * do not.  The PF_PK handing happens after we have a
+	 * do not.  The X86_PF_PK handing happens after we have a
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
@@ -697,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (!oops_may_print())
 		return;
 
-	if (error_code & PF_INSTR) {
+	if (error_code & X86_PF_INSTR) {
 		unsigned int level;
 		pgd_t *pgd;
 		pte_t *pte;
@@ -721,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	else
 		printk(KERN_CONT "paging request");
 
-	printk(KERN_CONT " at %p\n", (void *) address);
+	printk(KERN_CONT " at %px\n", (void *) address);
 	printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
 
 	dump_pagetable(address);
@@ -779,7 +759,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		 */
 		if (current->thread.sig_on_uaccess_err && signal) {
 			tsk->thread.trap_nr = X86_TRAP_PF;
-			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.error_code = error_code | X86_PF_USER;
 			tsk->thread.cr2 = address;
 
 			/* XXX: hwpoison faults will set the wrong code. */
@@ -880,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
@@ -897,7 +877,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * It's possible to have interrupts off here:
 		 */
@@ -918,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * Instruction fetch faults in the vsyscall page might need
 		 * emulation.
 		 */
-		if (unlikely((error_code & PF_INSTR) &&
+		if (unlikely((error_code & X86_PF_INSTR) &&
 			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
 			if (emulate_vsyscall(regs, address))
 				return;
@@ -931,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * are always protection faults.
 		 */
 		if (address >= TASK_SIZE_MAX)
-			error_code |= PF_PROT;
+			error_code |= X86_PF_PROT;
 
 		if (likely(show_unhandled_signals))
 			show_signal_msg(regs, error_code, address, tsk);
@@ -992,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
 
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return false;
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return true;
 	/* this checks permission keys on the VMA: */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return true;
 	return false;
 }
@@ -1024,7 +1004,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	int code = BUS_ADRERR;
 
 	/* Kernel mode? Handle exceptions or die: */
-	if (!(error_code & PF_USER)) {
+	if (!(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
@@ -1052,14 +1032,14 @@ static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, u32 *pkey, unsigned int fault)
 {
-	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
 		return;
 	}
 
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
-		if (!(error_code & PF_USER)) {
+		if (!(error_code & X86_PF_USER)) {
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
 			return;
@@ -1084,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
-	if ((error_code & PF_WRITE) && !pte_write(*pte))
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 
-	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
 	/*
 	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set PF_PK.
+	 * changes, so no spurious fault will ever set X86_PF_PK.
 	 */
-	if ((error_code & PF_PK))
+	if ((error_code & X86_PF_PK))
 		return 1;
 
 	return 1;
@@ -1139,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * change, so user accesses are not expected to cause spurious
 	 * faults.
 	 */
-	if (error_code != (PF_WRITE | PF_PROT)
-	    && error_code != (PF_INSTR | PF_PROT))
+	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+	    error_code != (X86_PF_INSTR | X86_PF_PROT))
 		return 0;
 
 	pgd = init_mm.pgd + pgd_index(address);
@@ -1200,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	 * always an unconditional error and can never result in
 	 * a follow-up action to resolve the fault, like a COW.
 	 */
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return 1;
 
 	/*
 	 * Make sure to check the VMA so that we do not perform
-	 * faults just to hit a PF_PK as soon as we fill in a
+	 * faults just to hit a X86_PF_PK as soon as we fill in a
 	 * page.
 	 */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return 1;
 
-	if (error_code & PF_WRITE) {
+	if (error_code & X86_PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
@@ -1220,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	}
 
 	/* read, present: */
-	if (unlikely(error_code & PF_PROT))
+	if (unlikely(error_code & X86_PF_PROT))
 		return 1;
 
 	/* read, not present: */
@@ -1243,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	if (!static_cpu_has(X86_FEATURE_SMAP))
 		return false;
 
-	if (error_code & PF_USER)
+	if (error_code & X86_PF_USER)
 		return false;
 
 	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1275,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * Detect and handle instructions that would cause a page fault for
 	 * both a tracked kernel page and a userspace page.
 	 */
-	if (kmemcheck_active(regs))
-		kmemcheck_hide(regs);
 	prefetchw(&mm->mmap_sem);
 
 	if (unlikely(kmmio_fault(regs, address)))
@@ -1296,12 +1274,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;
-
-			if (kmemcheck_fault(regs, address, error_code))
-				return;
 		}
 
 		/* Can handle a stale RO->RW TLB: */
@@ -1324,7 +1299,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & PF_RSVD))
+	if (unlikely(error_code & X86_PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
 	if (unlikely(smap_violation(error_code, regs))) {
@@ -1350,7 +1325,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= PF_USER;
+		error_code |= X86_PF_USER;
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1359,9 +1334,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & PF_WRITE)
+	if (error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
 	/*
@@ -1381,7 +1356,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * space check, thus avoiding the deadlock:
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if ((error_code & PF_USER) == 0 &&
+		if (!(error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
 			bad_area_nosemaphore(regs, error_code, address, NULL);
 			return;
@@ -1408,7 +1383,7 @@ retry:
 		bad_area(regs, error_code, address);
 		return;
 	}
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
@@ -1440,7 +1415,17 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
 	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+	 *
+	 * Note that handle_userfault() may also release and reacquire mmap_sem
+	 * (and not return with VM_FAULT_RETRY), when returning to userland to
+	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
+	 * (potentially after handling any pending signal during the return to
+	 * userland). The return to userland is identified whenever
+	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+	 * Thus we have to be careful about not touching vma after handling the
+	 * fault, so we read the pkey beforehand.
 	 */
+	pkey = vma_pkey(vma);
 	fault = handle_mm_fault(vma, address, flags);
 	major |= fault & VM_FAULT_MAJOR;
 
@@ -1467,7 +1452,6 @@ good_area:
 		return;
 	}
 
-	pkey = vma_pkey(vma);
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mm_fault_error(regs, error_code, address, &pkey, fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6d06cf33e3de..00b296617ca4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * IA-32 Huge TLB Page Support for Kernel.
  *
@@ -157,6 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
+	/* No address checking. See comment at mmap_address_hint_valid() */
 	if (flags & MAP_FIXED) {
 		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
@@ -164,12 +166,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	}
 
 	if (addr) {
-		addr = ALIGN(addr, huge_page_size(h));
+		addr &= huge_page_mask(h);
+		if (!mmap_address_hint_valid(addr, len))
+			goto get_unmapped_area;
+
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vm_start_gap(vma)))
+		if (!vma || addr + len <= vm_start_gap(vma))
 			return addr;
 	}
+
+get_unmapped_area:
 	if (mm->get_unmapped_area == arch_get_unmapped_area)
 		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
 				pgoff, flags);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 31cea988fa36..ab33a32df2a8 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Helper routines for building identity mapping page tables. This is
  * included by both the compressed kernel and the regular kernel.
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af5c1ed21d43..82f5252c723a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -92,8 +93,7 @@ __ref void *alloc_low_pages(unsigned int num)
 		unsigned int order;
 
 		order = get_order((unsigned long)num << PAGE_SHIFT);
-		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
-						__GFP_ZERO, order);
+		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
 	}
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -161,15 +161,20 @@ struct map_range {
 
 static int page_size_mask;
 
+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
-	 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
-	 * use small pages.
+	 * For pagealloc debugging, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
 		page_size_mask |= 1 << PG_LEVEL_2M;
 	else
 		direct_gbpages = 0;
@@ -179,11 +184,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}
 
 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -196,34 +201,44 @@ static void __init probe_page_size_mask(void)
 
 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }
 
 #ifdef CONFIG_X86_32
@@ -624,6 +639,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;
 
+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();
 
@@ -671,7 +687,7 @@ void __init init_mem_mapping(void)
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
 
-	hypervisor_init_mem_mapping();
+	x86_init.hyper.init_mem_mapping();
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
@@ -847,12 +863,12 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_SYMBOL_GPL(cpu_tlbstate);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/init.h>
 
 #include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 	printk(KERN_INFO "virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
 		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
 		FIXADDR_START, FIXADDR_TOP,
 		(FIXADDR_TOP - FIXADDR_START) >> 10,
 
+		CPU_ENTRY_AREA_BASE,
+		CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+		CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
 #ifdef CONFIG_HIGHMEM
 		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
 		(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 048fbe8fc274..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
 	void *ptr;
 
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
 
@@ -1173,12 +1173,18 @@ void __init mem_init(void)
 
 	/* clear_bss() already clear the empty_zero_page */
 
-	register_page_bootmem_info();
-
 	/* this will put all memory onto the freelists */
 	free_all_bootmem();
 	after_bootmem = 1;
 
+	/*
+	 * Must be done after boot memory is put on freelist, because here we
+	 * might set fields in deferred struct pages that have not yet been
+	 * initialized, and free_all_bootmem() initializes all the reserved
+	 * deferred pages for us.
+	 */
+	register_page_bootmem_info();
+
 	/* Register memory areas for /proc/kcore */
 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
 			 PAGE_SIZE, KCORE_OTHER);
@@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
 			continue;
 		}
-		pr_warn_once("vmemmap: falling back to regular page backing\n");
 		if (vmemmap_populate_basepages(addr, next, node))
 			return -ENOMEM;
 	}
@@ -1426,16 +1431,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
 void register_page_bootmem_memmap(unsigned long section_nr,
-				  struct page *start_page, unsigned long size)
+				  struct page *start_page, unsigned long nr_pages)
 {
 	unsigned long addr = (unsigned long)start_page;
-	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long end = (unsigned long)(start_page + nr_pages);
 	unsigned long next;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
-	unsigned int nr_pages;
+	unsigned int nr_pmd_pages;
 	struct page *page;
 
 	for (; addr < end; addr = next) {
@@ -1482,9 +1487,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 			if (pmd_none(*pmd))
 				continue;
 
-			nr_pages = 1 << (get_order(PMD_SIZE));
+			nr_pmd_pages = 1 << get_order(PMD_SIZE);
 			page = pmd_page(*pmd);
-			while (nr_pages--)
+			while (nr_pmd_pages--)
 				get_page_bootmem(section_nr, page++,
 						 SECTION_INFO);
 		}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 34f0e1847dd6..c45b6ec5357b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -27,6 +27,11 @@
 
 #include "physaddr.h"
 
+struct ioremap_mem_flags {
+	bool system_ram;
+	bool desc_other;
+};
+
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
@@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	return err;
 }
 
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
-			       void *arg)
+static bool __ioremap_check_ram(struct resource *res)
 {
+	unsigned long start_pfn, stop_pfn;
 	unsigned long i;
 
-	for (i = 0; i < nr_pages; ++i)
-		if (pfn_valid(start_pfn + i) &&
-		    !PageReserved(pfn_to_page(start_pfn + i)))
-			return 1;
+	if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+		return false;
 
-	return 0;
+	start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+	if (stop_pfn > start_pfn) {
+		for (i = 0; i < (stop_pfn - start_pfn); ++i)
+			if (pfn_valid(start_pfn + i) &&
+			    !PageReserved(pfn_to_page(start_pfn + i)))
+				return true;
+	}
+
+	return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+	return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+	struct ioremap_mem_flags *flags = arg;
+
+	if (!flags->system_ram)
+		flags->system_ram = __ioremap_check_ram(res);
+
+	if (!flags->desc_other)
+		flags->desc_other = __ioremap_check_desc_other(res);
+
+	return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
+ * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+				struct ioremap_mem_flags *flags)
+{
+	u64 start, end;
+
+	start = (u64)addr;
+	end = start + size - 1;
+	memset(flags, 0, sizeof(*flags));
+
+	walk_mem_res(start, end, flags, __ioremap_res_check);
 }
 
 /*
@@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		unsigned long size, enum page_cache_mode pcm, void *caller)
 {
 	unsigned long offset, vaddr;
-	resource_size_t pfn, last_pfn, last_addr;
+	resource_size_t last_addr;
 	const resource_size_t unaligned_phys_addr = phys_addr;
 	const unsigned long unaligned_size = size;
+	struct ioremap_mem_flags mem_flags;
 	struct vm_struct *area;
 	enum page_cache_mode new_pcm;
 	pgprot_t prot;
@@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	}
 
+	__ioremap_check_mem(phys_addr, size, &mem_flags);
+
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	pfn      = phys_addr >> PAGE_SHIFT;
-	last_pfn = last_addr >> PAGE_SHIFT;
-	if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
-					  __ioremap_check_ram) == 1) {
+	if (mem_flags.system_ram) {
 		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
 			  &phys_addr, &last_addr);
 		return NULL;
@@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		pcm = new_pcm;
 	}
 
+	/*
+	 * If the page being mapped is in memory and SEV is active then
+	 * make sure the memory encryption attribute is enabled in the
+	 * resulting mapping.
+	 */
 	prot = PAGE_KERNEL_IO;
+	if (sev_active() && mem_flags.desc_other)
+		prot = pgprot_encrypted(prot);
+
 	switch (pcm) {
 	case _PAGE_CACHE_MODE_UC:
 	default:
@@ -349,11 +404,11 @@ void iounmap(volatile void __iomem *addr)
 		return;
 	}
 
+	mmiotrace_iounmap(addr);
+
 	addr = (volatile void __iomem *)
 		(PAGE_MASK & (unsigned long __force)addr);
 
-	mmiotrace_iounmap(addr);
-
 	/* Use the vm area unlocked, assuming the caller
 	   ensures there isn't another iounmap for the same address
 	   in parallel. Reuse of the virtual address is prevented by
@@ -422,6 +477,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
  * areas should be mapped decrypted. And since the encryption key can
  * change across reboots, persistent memory should also be mapped
  * decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
  */
 static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 					  unsigned long size)
@@ -458,6 +516,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 	case E820_TYPE_ACPI:
 	case E820_TYPE_NVS:
 	case E820_TYPE_UNUSABLE:
+		/* For SEV, these areas are encrypted */
+		if (sev_active())
+			break;
+		/* Fallthrough */
+
 	case E820_TYPE_PRAM:
 		return true;
 	default:
@@ -581,7 +644,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
 bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
 				 unsigned long flags)
 {
-	if (!sme_active())
+	if (!mem_encrypt_active())
 		return true;
 
 	if (flags & MEMREMAP_ENC)
@@ -590,12 +653,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
 	if (flags & MEMREMAP_DEC)
 		return false;
 
-	if (memremap_is_setup_data(phys_addr, size) ||
-	    memremap_is_efi_data(phys_addr, size) ||
-	    memremap_should_map_decrypted(phys_addr, size))
-		return false;
+	if (sme_active()) {
+		if (memremap_is_setup_data(phys_addr, size) ||
+		    memremap_is_efi_data(phys_addr, size))
+			return false;
+	}
 
-	return true;
+	return !memremap_should_map_decrypted(phys_addr, size);
 }
 
 /*
@@ -608,17 +672,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
 					     unsigned long size,
 					     pgprot_t prot)
 {
-	if (!sme_active())
+	bool encrypted_prot;
+
+	if (!mem_encrypt_active())
 		return prot;
 
-	if (early_memremap_is_setup_data(phys_addr, size) ||
-	    memremap_is_efi_data(phys_addr, size) ||
-	    memremap_should_map_decrypted(phys_addr, size))
-		prot = pgprot_decrypted(prot);
-	else
-		prot = pgprot_encrypted(prot);
+	encrypted_prot = true;
+
+	if (sme_active()) {
+		if (early_memremap_is_setup_data(phys_addr, size) ||
+		    memremap_is_efi_data(phys_addr, size))
+			encrypted_prot = false;
+	}
+
+	if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+		encrypted_prot = false;
 
-	return prot;
+	return encrypted_prot ? pgprot_encrypted(prot)
+			      : pgprot_decrypted(prot);
 }
 
 bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index bc84b73684b7..47388f0c0e59 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,21 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
 #define DISABLE_BRANCH_PROFILING
 #define pr_fmt(fmt) "kasan: " fmt
 #include <linux/bootmem.h>
 #include <linux/kasan.h>
 #include <linux/kdebug.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
 #include <linux/vmalloc.h>
 
 #include <asm/e820/types.h>
+#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
-static int __init map_range(struct range *range)
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
+static __init void *early_alloc(size_t size, int nid)
+{
+	return memblock_virt_alloc_try_nid_nopanic(size, size,
+		__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
+				      unsigned long end, int nid)
+{
+	pte_t *pte;
+
+	if (pmd_none(*pmd)) {
+		void *p;
+
+		if (boot_cpu_has(X86_FEATURE_PSE) &&
+		    ((end - addr) == PMD_SIZE) &&
+		    IS_ALIGNED(addr, PMD_SIZE)) {
+			p = early_alloc(PMD_SIZE, nid);
+			if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
+				return;
+			else if (p)
+				memblock_free(__pa(p), PMD_SIZE);
+		}
+
+		p = early_alloc(PAGE_SIZE, nid);
+		pmd_populate_kernel(&init_mm, pmd, p);
+	}
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		pte_t entry;
+		void *p;
+
+		if (!pte_none(*pte))
+			continue;
+
+		p = early_alloc(PAGE_SIZE, nid);
+		entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
+		set_pte_at(&init_mm, addr, pte, entry);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
+				      unsigned long end, int nid)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	if (pud_none(*pud)) {
+		void *p;
+
+		if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
+		    ((end - addr) == PUD_SIZE) &&
+		    IS_ALIGNED(addr, PUD_SIZE)) {
+			p = early_alloc(PUD_SIZE, nid);
+			if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
+				return;
+			else if (p)
+				memblock_free(__pa(p), PUD_SIZE);
+		}
+
+		p = early_alloc(PAGE_SIZE, nid);
+		pud_populate(&init_mm, pud, p);
+	}
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_large(*pmd))
+			kasan_populate_pmd(pmd, addr, next, nid);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
+				      unsigned long end, int nid)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	if (p4d_none(*p4d)) {
+		void *p = early_alloc(PAGE_SIZE, nid);
+
+		p4d_populate(&init_mm, p4d, p);
+	}
+
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (!pud_large(*pud))
+			kasan_populate_pud(pud, addr, next, nid);
+	} while (pud++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
+				      unsigned long end, int nid)
+{
+	void *p;
+	p4d_t *p4d;
+	unsigned long next;
+
+	if (pgd_none(*pgd)) {
+		p = early_alloc(PAGE_SIZE, nid);
+		pgd_populate(&init_mm, pgd, p);
+	}
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		kasan_populate_p4d(p4d, addr, next, nid);
+	} while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
+					 int nid)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	addr = addr & PAGE_MASK;
+	end = round_up(end, PAGE_SIZE);
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		kasan_populate_pgd(pgd, addr, next, nid);
+	} while (pgd++, addr = next, addr != end);
+}
+
+static void __init map_range(struct range *range)
 {
 	unsigned long start;
 	unsigned long end;
@@ -23,15 +156,17 @@ static int __init map_range(struct range *range)
 	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
 	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
 
-	return vmemmap_populate(start, end, NUMA_NO_NODE);
+	kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
 }
 
 static void __init clear_pgds(unsigned long start,
 			unsigned long end)
 {
 	pgd_t *pgd;
+	/* See comment in kasan_init() */
+	unsigned long pgd_end = end & PGDIR_MASK;
 
-	for (; start < end; start += PGDIR_SIZE) {
+	for (; start < pgd_end; start += PGDIR_SIZE) {
 		pgd = pgd_offset_k(start);
 		/*
 		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -42,29 +177,61 @@ static void __init clear_pgds(unsigned long start,
 		else
 			pgd_clear(pgd);
 	}
+
+	pgd = pgd_offset_k(start);
+	for (; start < end; start += P4D_SIZE)
+		p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+	unsigned long p4d;
+
+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+		return (p4d_t *)pgd;
+
+	p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+	p4d += __START_KERNEL_map - phys_base;
+	return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+		unsigned long addr,
+		unsigned long end)
+{
+	pgd_t pgd_entry;
+	p4d_t *p4d, p4d_entry;
+	unsigned long next;
+
+	if (pgd_none(*pgd)) {
+		pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+		set_pgd(pgd, pgd_entry);
+	}
+
+	p4d = early_p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (!p4d_none(*p4d))
+			continue;
+
+		p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+		set_p4d(p4d, p4d_entry);
+	} while (p4d++, addr = next, addr != end && p4d_none(*p4d));
 }
 
 static void __init kasan_map_early_shadow(pgd_t *pgd)
 {
-	int i;
-	unsigned long start = KASAN_SHADOW_START;
+	/* See comment in kasan_init() */
+	unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
 	unsigned long end = KASAN_SHADOW_END;
+	unsigned long next;
 
-	for (i = pgd_index(start); start < end; i++) {
-		switch (CONFIG_PGTABLE_LEVELS) {
-		case 4:
-			pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
-					_KERNPG_TABLE);
-			break;
-		case 5:
-			pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
-					_KERNPG_TABLE);
-			break;
-		default:
-			BUILD_BUG();
-		}
-		start += PGDIR_SIZE;
-	}
+	pgd += pgd_index(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		kasan_early_p4d_populate(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
 }
 
 #ifdef CONFIG_KASAN_INLINE
@@ -101,7 +268,7 @@ void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PUD; i++)
 		kasan_zero_pud[i] = __pud(pud_val);
 
-	for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+	for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
 		kasan_zero_p4d[i] = __p4d(p4d_val);
 
 	kasan_map_early_shadow(early_top_pgt);
@@ -111,37 +278,78 @@ void __init kasan_early_init(void)
 void __init kasan_init(void)
 {
 	int i;
+	void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
 
 #ifdef CONFIG_KASAN_INLINE
 	register_die_notifier(&kasan_die_notifier);
 #endif
 
 	memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+	/*
+	 * We use the same shadow offset for 4- and 5-level paging to
+	 * facilitate boot-time switching between paging modes.
+	 * As result in 5-level paging mode KASAN_SHADOW_START and
+	 * KASAN_SHADOW_END are not aligned to PGD boundary.
+	 *
+	 * KASAN_SHADOW_START doesn't share PGD with anything else.
+	 * We claim whole PGD entry to make things easier.
+	 *
+	 * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+	 * bunch of things like kernel code, modules, EFI mapping, etc.
+	 * We need to take extra steps to not overwrite them.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		void *ptr;
+
+		ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+		memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+		set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+				__pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+	}
+
 	load_cr3(early_top_pgt);
 	__flush_tlb_all();
 
-	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+	clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
 
-	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+	kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
 			kasan_mem_to_shadow((void *)PAGE_OFFSET));
 
 	for (i = 0; i < E820_MAX_ENTRIES; i++) {
 		if (pfn_mapped[i].end == 0)
 			break;
 
-		if (map_range(&pfn_mapped[i]))
-			panic("kasan: unable to allocate shadow!");
+		map_range(&pfn_mapped[i]);
 	}
+
+	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+						PAGE_SIZE);
+
+	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+					CPU_ENTRY_AREA_MAP_SIZE);
+	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+					PAGE_SIZE);
+
 	kasan_populate_zero_shadow(
 		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-		kasan_mem_to_shadow((void *)__START_KERNEL_map));
+		shadow_cpu_entry_begin);
+
+	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+			      (unsigned long)shadow_cpu_entry_end, 0);
+
+	kasan_populate_zero_shadow(shadow_cpu_entry_end,
+				kasan_mem_to_shadow((void *)__START_KERNEL_map));
 
-	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
-			(unsigned long)kasan_mem_to_shadow(_end),
-			NUMA_NO_NODE);
+	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+			      (unsigned long)kasan_mem_to_shadow(_end),
+			      early_pfn_to_nid(__pa(_stext)));
 
 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-			(void *)KASAN_SHADOW_END);
+				(void *)KASAN_SHADOW_END);
 
 	load_cr3(init_top_pgt);
 	__flush_tlb_all();
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index af599167fe3c..aedebd2ebf1e 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file implements KASLR memory randomization for x86_64. It randomizes
  * the virtual address space of kernel memory regions (physical memory
@@ -33,25 +34,14 @@
 #define TB_SHIFT 40
 
 /*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
  *
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization available, but that's too hard
+ * to keep straight and caused issues already.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-
-#if defined(CONFIG_X86_ESPFIX64)
-static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
-#elif defined(CONFIG_EFI)
-static const unsigned long vaddr_end = EFI_VA_END;
-#else
-static const unsigned long vaddr_end = __START_KERNEL_map;
-#endif
+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -100,15 +90,12 @@ void __init kernel_randomize_memory(void)
 	unsigned long remain_entropy;
 
 	/*
-	 * All these BUILD_BUG_ON checks ensures the memory layout is
-	 * consistent with the vaddr_start/vaddr_end variables.
+	 * These BUILD_BUG_ON checks ensure the memory layout is consistent
+	 * with the vaddr_start/vaddr_end variables. These checks are very
+	 * limited....
 	 */
 	BUILD_BUG_ON(vaddr_start >= vaddr_end);
-	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
-		     vaddr_end >= EFI_VA_END);
-	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
-		      IS_ENABLED(CONFIG_EFI)) &&
-		     vaddr_end >= __START_KERNEL_map);
+	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
 	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 
 	if (!kaslr_memory_enabled())
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
deleted file mode 100644
index dab41876cdd5..000000000000
--- a/arch/x86/mm/kmemcheck/error.c
+++ /dev/null
@@ -1,227 +0,0 @@
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
-	KMEMCHECK_ERROR_INVALID_ACCESS,
-	KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
-	enum kmemcheck_error_type type;
-
-	union {
-		/* KMEMCHECK_ERROR_INVALID_ACCESS */
-		struct {
-			/* Kind of access that caused the error */
-			enum kmemcheck_shadow state;
-			/* Address and size of the erroneous read */
-			unsigned long	address;
-			unsigned int	size;
-		};
-	};
-
-	struct pt_regs		regs;
-	struct stack_trace	trace;
-	unsigned long		trace_entries[32];
-
-	/* We compress it to a char. */
-	unsigned char		shadow_copy[SHADOW_COPY_SIZE];
-	unsigned char		memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == ARRAY_SIZE(error_fifo)) {
-		++error_missed_count;
-		return NULL;
-	}
-
-	e = &error_fifo[error_wr];
-	if (++error_wr == ARRAY_SIZE(error_fifo))
-		error_wr = 0;
-	++error_count;
-	return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == 0)
-		return NULL;
-
-	e = &error_fifo[error_rd];
-	if (++error_rd == ARRAY_SIZE(error_fifo))
-		error_rd = 0;
-	--error_count;
-	return e;
-}
-
-void kmemcheck_error_recall(void)
-{
-	static const char *desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated",
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized",
-		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized",
-		[KMEMCHECK_SHADOW_FREED]		= "freed",
-	};
-
-	static const char short_desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a',
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u',
-		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i',
-		[KMEMCHECK_SHADOW_FREED]		= 'f',
-	};
-
-	struct kmemcheck_error *e;
-	unsigned int i;
-
-	e = error_next_rd();
-	if (!e)
-		return;
-
-	switch (e->type) {
-	case KMEMCHECK_ERROR_INVALID_ACCESS:
-		printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
-			8 * e->size, e->state < ARRAY_SIZE(desc) ?
-				desc[e->state] : "(invalid shadow state)",
-			(void *) e->address);
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
-			printk(KERN_CONT "%02x", e->memory_copy[i]);
-		printk(KERN_CONT "\n");
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
-			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
-				printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
-			else
-				printk(KERN_CONT " ?");
-		}
-		printk(KERN_CONT "\n");
-		printk(KERN_WARNING "%*c\n", 2 + 2
-			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
-		break;
-	case KMEMCHECK_ERROR_BUG:
-		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
-		break;
-	}
-
-	__show_regs(&e->regs, 1);
-	print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
-	while (error_count > 0)
-		kmemcheck_error_recall();
-
-	if (error_missed_count > 0) {
-		printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
-			"the queue was too small\n", error_missed_count);
-		error_missed_count = 0;
-	}
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs)
-{
-	static unsigned long prev_ip;
-
-	struct kmemcheck_error *e;
-	void *shadow_copy;
-	void *memory_copy;
-
-	/* Don't report several adjacent errors from the same EIP. */
-	if (regs->ip == prev_ip)
-		return;
-	prev_ip = regs->ip;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
-	e->state = state;
-	e->address = address;
-	e->size = size;
-
-	/* Save regs */
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	/* Save stack trace */
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 0;
-	save_stack_trace_regs(regs, &e->trace);
-
-	/* Round address down to nearest 16 bytes */
-	shadow_copy = kmemcheck_shadow_lookup(address
-		& ~(SHADOW_COPY_SIZE - 1));
-	BUG_ON(!shadow_copy);
-
-	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
-	kmemcheck_show_addr(address);
-	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
-	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
-	kmemcheck_hide_addr(address);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
-	struct kmemcheck_error *e;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_BUG;
-
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 1;
-	save_stack_trace(&e->trace);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
deleted file mode 100644
index 0efc2e8d0a20..000000000000
--- a/arch/x86/mm/kmemcheck/error.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008  Vegard Nossum <[email protected]>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * Limit SMP to use a single CPU. We rely on the fact that this code
-	 * runs before SMP is set up.
-	 */
-	if (setup_max_cpus > 1) {
-		printk(KERN_INFO
-			"kmemcheck: Limiting number of CPUs to 1.\n");
-		setup_max_cpus = 1;
-	}
-#endif
-
-	if (!kmemcheck_selftest()) {
-		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
-		kmemcheck_enabled = 0;
-		return -EINVAL;
-	}
-
-	printk(KERN_INFO "kmemcheck: Initialized\n");
-	return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
-	int val;
-	int ret;
-
-	if (!str)
-		return -EINVAL;
-
-	ret = kstrtoint(str, 0, &val);
-	if (ret)
-		return ret;
-	kmemcheck_enabled = val;
-	return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-struct kmemcheck_context {
-	bool busy;
-	int balance;
-
-	/*
-	 * There can be at most two memory operands to an instruction, but
-	 * each address can cross a page boundary -- so we may need up to
-	 * four addresses that must be hidden/revealed for each fault.
-	 */
-	unsigned long addr[4];
-	unsigned long n_addrs;
-	unsigned long flags;
-
-	/* Data size of the instruction that caused a fault. */
-	unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
-	data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_show_addr(data->addr[i]);
-
-	return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_hide_addr(data->addr[i]);
-
-	return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 0)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->balance = 0;
-		return;
-	}
-
-	/*
-	 * None of the addresses actually belonged to kmemcheck. Note that
-	 * this is not an error.
-	 */
-	if (kmemcheck_show_all() == 0)
-		return;
-
-	++data->balance;
-
-	/*
-	 * The IF needs to be cleared as well, so that the faulting
-	 * instruction can run "uninterrupted". Otherwise, we might take
-	 * an interrupt and start executing that before we've had a chance
-	 * to hide the page again.
-	 *
-	 * NOTE: In the rare case of multiple faults, we must not override
-	 * the original flags:
-	 */
-	if (!(regs->flags & X86_EFLAGS_TF))
-		data->flags = regs->flags;
-
-	regs->flags |= X86_EFLAGS_TF;
-	regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	int n;
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 1)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->n_addrs = 0;
-		data->balance = 0;
-
-		if (!(data->flags & X86_EFLAGS_TF))
-			regs->flags &= ~X86_EFLAGS_TF;
-		if (data->flags & X86_EFLAGS_IF)
-			regs->flags |= X86_EFLAGS_IF;
-		return;
-	}
-
-	if (kmemcheck_enabled)
-		n = kmemcheck_hide_all();
-	else
-		n = kmemcheck_show_all();
-
-	if (n == 0)
-		return;
-
-	--data->balance;
-
-	data->n_addrs = 0;
-
-	if (!(data->flags & X86_EFLAGS_TF))
-		regs->flags &= ~X86_EFLAGS_TF;
-	if (data->flags & X86_EFLAGS_IF)
-		regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
-	/* This will also check the "hidden" flag of the PTE. */
-	return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-	enum kmemcheck_shadow status;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-
-	/* Don't warn about it again. */
-	kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	enum kmemcheck_shadow status;
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return true;
-
-	status = kmemcheck_shadow_test_all(shadow, size);
-
-	return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_read_strict(regs, addr, size);
-		return;
-	}
-
-	/*
-	 * What we do is basically to split the access across the
-	 * two pages and handle each part separately. Yes, this means
-	 * that we may now see reads that are 3 + 5 bytes, for
-	 * example (and if both are uninitialized, there will be two
-	 * reports), but it makes the code a lot simpler.
-	 */
-	kmemcheck_read_strict(regs, addr, next_page - addr);
-	kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_write_strict(regs, addr, size);
-		return;
-	}
-
-	/* See comment in kmemcheck_read(). */
-	kmemcheck_write_strict(regs, addr, next_page - addr);
-	kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
-	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
-	uint8_t shadow[8];
-	enum kmemcheck_shadow status;
-
-	unsigned long page;
-	unsigned long next_addr;
-	unsigned long next_page;
-
-	uint8_t *x;
-	unsigned int i;
-	unsigned int n;
-
-	BUG_ON(size > sizeof(shadow));
-
-	page = src_addr & PAGE_MASK;
-	next_addr = src_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < size; ++i)
-				shadow[i] = x[i];
-		} else {
-			for (i = 0; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	} else {
-		n = next_page - src_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < n; ++i)
-				shadow[i] = x[i];
-		} else {
-			/* Not tracked */
-			for (i = 0; i < n; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i)
-				shadow[i] = x[i - n];
-		} else {
-			/* Not tracked */
-			for (i = n; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	}
-
-	page = dst_addr & PAGE_MASK;
-	next_addr = dst_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < size; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	} else {
-		n = next_page - dst_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < n; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i) {
-				x[i - n] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	}
-
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, src_addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
-	KMEMCHECK_READ,
-	KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
-	unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
-	const uint8_t *insn;
-	const uint8_t *insn_primary;
-	unsigned int size;
-
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	/* Recursive fault -- ouch. */
-	if (data->busy) {
-		kmemcheck_show_addr(fallback_address);
-		kmemcheck_error_save_bug(regs);
-		return;
-	}
-
-	data->busy = true;
-
-	insn = (const uint8_t *) regs->ip;
-	insn_primary = kmemcheck_opcode_get_primary(insn);
-
-	kmemcheck_opcode_decode(insn, &size);
-
-	switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
-		/* AND, OR, XOR */
-		/*
-		 * Unfortunately, these instructions have to be excluded from
-		 * our regular checking since they access only some (and not
-		 * all) bits. This clears out "bogus" bitfield-access warnings.
-		 */
-	case 0x80:
-	case 0x81:
-	case 0x82:
-	case 0x83:
-		switch ((insn_primary[1] >> 3) & 7) {
-			/* OR */
-		case 1:
-			/* AND */
-		case 4:
-			/* XOR */
-		case 6:
-			kmemcheck_write(regs, fallback_address, size);
-			goto out;
-
-			/* ADD */
-		case 0:
-			/* ADC */
-		case 2:
-			/* SBB */
-		case 3:
-			/* SUB */
-		case 5:
-			/* CMP */
-		case 7:
-			break;
-		}
-		break;
-#endif
-
-		/* MOVS, MOVSB, MOVSW, MOVSD */
-	case 0xa4:
-	case 0xa5:
-		/*
-		 * These instructions are special because they take two
-		 * addresses, but we only get one page fault.
-		 */
-		kmemcheck_copy(regs, regs->si, regs->di, size);
-		goto out;
-
-		/* CMPS, CMPSB, CMPSW, CMPSD */
-	case 0xa6:
-	case 0xa7:
-		kmemcheck_read(regs, regs->si, size);
-		kmemcheck_read(regs, regs->di, size);
-		goto out;
-	}
-
-	/*
-	 * If the opcode isn't special in any way, we use the data from the
-	 * page fault handler to determine the address and type of memory
-	 * access.
-	 */
-	switch (fallback_method) {
-	case KMEMCHECK_READ:
-		kmemcheck_read(regs, fallback_address, size);
-		goto out;
-	case KMEMCHECK_WRITE:
-		kmemcheck_write(regs, fallback_address, size);
-		goto out;
-	}
-
-out:
-	data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
-	unsigned long error_code)
-{
-	pte_t *pte;
-
-	/*
-	 * XXX: Is it safe to assume that memory accesses from virtual 86
-	 * mode or non-kernel code segments will _never_ access kernel
-	 * memory (e.g. tracked pages)? For now, we need this to avoid
-	 * invoking kmemcheck for PnP BIOS calls.
-	 */
-	if (regs->flags & X86_VM_MASK)
-		return false;
-	if (regs->cs != __KERNEL_CS)
-		return false;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return false;
-
-	WARN_ON_ONCE(in_nmi());
-
-	if (error_code & 2)
-		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
-	else
-		kmemcheck_access(regs, address, KMEMCHECK_READ);
-
-	kmemcheck_show(regs);
-	return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
-	if (!kmemcheck_active(regs))
-		return false;
-
-	/* We're done. */
-	kmemcheck_hide(regs);
-	return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
deleted file mode 100644
index 324aa3f07237..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ /dev/null
@@ -1,106 +0,0 @@
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
-	return
-		/* Group 1 */
-		b == 0xf0 || b == 0xf2 || b == 0xf3
-		/* Group 2 */
-		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-		|| b == 0x64 || b == 0x65
-		/* Group 3 */
-		|| b == 0x66
-		/* Group 4 */
-		|| b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
-	/* Default operand size */
-	int operand_size_override = 4;
-
-	/* prefixes */
-	for (; opcode_is_prefix(*op); ++op) {
-		if (*op == 0x66)
-			operand_size_override = 2;
-	}
-
-	/* REX prefix */
-	if (opcode_is_rex_prefix(*op)) {
-		uint8_t rex = *op;
-
-		++op;
-		if (rex & REX_W) {
-			switch (*op) {
-			case 0x63:
-				*size = 4;
-				return;
-			case 0x0f:
-				++op;
-
-				switch (*op) {
-				case 0xb6:
-				case 0xbe:
-					*size = 1;
-					return;
-				case 0xb7:
-				case 0xbf:
-					*size = 2;
-					return;
-				}
-
-				break;
-			}
-
-			*size = 8;
-			return;
-		}
-	}
-
-	/* escape opcode */
-	if (*op == 0x0f) {
-		++op;
-
-		/*
-		 * This is move with zero-extend and sign-extend, respectively;
-		 * we don't have to think about 0xb6/0xbe, because this is
-		 * already handled in the conditional below.
-		 */
-		if (*op == 0xb7 || *op == 0xbf)
-			operand_size_override = 2;
-	}
-
-	*size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
-	/* skip prefixes */
-	while (opcode_is_prefix(*op))
-		++op;
-	if (opcode_is_rex_prefix(*op))
-		++op;
-	return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
deleted file mode 100644
index 6956aad66b5b..000000000000
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
deleted file mode 100644
index 4ead26eeaf96..000000000000
--- a/arch/x86/mm/kmemcheck/pte.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
-	pte_t *pte;
-	unsigned int level;
-
-	pte = lookup_address(address, &level);
-	if (!pte)
-		return NULL;
-	if (level != PG_LEVEL_4K)
-		return NULL;
-	if (!pte_hidden(*pte))
-		return NULL;
-
-	return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
deleted file mode 100644
index 9f5966456492..000000000000
--- a/arch/x86/mm/kmemcheck/pte.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
deleted file mode 100644
index aef7140c0063..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
-	unsigned int expected_size;
-	const uint8_t *insn;
-	const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
-	/* REP MOVS */
-	{1, "\xf3\xa4", 		"rep movsb <mem8>, <mem8>"},
-	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"},
-
-	/* MOVZX / MOVZXD */
-	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"},
-	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"},
-	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
-	/* MOVZX / MOVZXD */
-	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"},
-	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
-	unsigned size;
-
-	kmemcheck_opcode_decode(op->insn, &size);
-
-	if (size == op->expected_size)
-		return true;
-
-	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
-		op->desc, op->expected_size, size);
-	return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
-	bool pass = true;
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
-		pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
-	return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
-	bool pass = true;
-
-	pass = pass && selftest_opcodes_all();
-
-	return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
deleted file mode 100644
index 8fed4fe11f95..000000000000
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
-	pte_t *pte;
-	struct page *page;
-
-	if (!virt_addr_valid(address))
-		return NULL;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return NULL;
-
-	page = virt_to_page(address);
-	if (!page->shadow)
-		return NULL;
-	return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
-	enum kmemcheck_shadow status)
-{
-	unsigned long addr = (unsigned long) address;
-	unsigned long last_addr = addr + n - 1;
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long last_page = last_addr & PAGE_MASK;
-	unsigned int first_n;
-	void *shadow;
-
-	/* If the memory range crosses a page boundary, stop there. */
-	if (page == last_page)
-		first_n = n;
-	else
-		first_n = page + PAGE_SIZE - addr;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (shadow)
-		memset(shadow, status, first_n);
-
-	addr += first_n;
-	n -= first_n;
-
-	/* Do full-page memset()s. */
-	while (n >= PAGE_SIZE) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, PAGE_SIZE);
-
-		addr += PAGE_SIZE;
-		n -= PAGE_SIZE;
-	}
-
-	/* Do the remaining page, if any. */
-	if (n > 0) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, n);
-	}
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/*
-	 * Make sure _some_ bytes are initialized. Gcc frequently generates
-	 * code to access neighboring bytes.
-	 */
-	for (i = 0; i < size; ++i) {
-		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-#else
-	return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/* All bytes must be initialized. */
-	for (i = 0; i < size; ++i) {
-		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-	for (i = 0; i < size; ++i)
-		x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
deleted file mode 100644
index ff0b2f70fbcb..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
-	KMEMCHECK_SHADOW_UNALLOCATED,
-	KMEMCHECK_SHADOW_UNINITIALIZED,
-	KMEMCHECK_SHADOW_INITIALIZED,
-	KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
-						unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index afc47f5c9531..58477ec3d66d 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* Support for MMIO probes.
  * Benfit many code from kprobes
  * (C) 2002 Louis Zhuang <[email protected]>.
@@ -434,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p)
 	unsigned long flags;
 	int ret = 0;
 	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
 	unsigned int l;
 	pte_t *pte;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
-	if (get_kmmio_probe(p->addr)) {
+	if (get_kmmio_probe(addr)) {
 		ret = -EEXIST;
 		goto out;
 	}
 
-	pte = lookup_address(p->addr, &l);
+	pte = lookup_address(addr, &l);
 	if (!pte) {
 		ret = -EINVAL;
 		goto out;
@@ -453,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
 	kmmio_count++;
 	list_add_rcu(&p->list, &kmmio_probes);
 	while (size < size_lim) {
-		if (add_kmmio_fault_page(p->addr + size))
+		if (add_kmmio_fault_page(addr + size))
 			pr_err("Unable to set page fault.\n");
 		size += page_level_size(l);
 	}
@@ -527,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 {
 	unsigned long flags;
 	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
 	struct kmmio_fault_page *release_list = NULL;
 	struct kmmio_delayed_release *drelease;
 	unsigned int l;
 	pte_t *pte;
 
-	pte = lookup_address(p->addr, &l);
+	pte = lookup_address(addr, &l);
 	if (!pte)
 		return;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (size < size_lim) {
-		release_kmmio_fault_page(p->addr + size, &release_list);
+		release_kmmio_fault_page(addr + size, &release_list);
 		size += page_level_size(l);
 	}
 	list_del_rcu(&p->list);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 16c5f37933a2..391b13402e40 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -30,6 +30,8 @@
 #include <asm/msr.h>
 #include <asm/cmdline.h>
 
+#include "mm_internal.h"
+
 static char sme_cmdline_arg[] __initdata = "mem_encrypt";
 static char sme_cmdline_on[]  __initdata = "on";
 static char sme_cmdline_off[] __initdata = "off";
@@ -40,7 +42,11 @@ static char sme_cmdline_off[] __initdata = "off";
  * section is later cleared.
  */
 u64 sme_me_mask __section(.data) = 0;
-EXPORT_SYMBOL_GPL(sme_me_mask);
+EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+static bool sev_enabled __section(.data);
 
 /* Buffer used for early in-place encryption by BSP, no locking needed */
 static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -63,7 +69,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr,
 	if (!sme_me_mask)
 		return;
 
-	local_flush_tlb();
 	wbinvd();
 
 	/*
@@ -190,8 +195,238 @@ void __init sme_early_init(void)
 	/* Update the protection map with memory encryption mask */
 	for (i = 0; i < ARRAY_SIZE(protection_map); i++)
 		protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+	if (sev_active())
+		swiotlb_force = SWIOTLB_FORCE;
+}
+
+static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		       gfp_t gfp, unsigned long attrs)
+{
+	unsigned long dma_mask;
+	unsigned int order;
+	struct page *page;
+	void *vaddr = NULL;
+
+	dma_mask = dma_alloc_coherent_mask(dev, gfp);
+	order = get_order(size);
+
+	/*
+	 * Memory will be memset to zero after marking decrypted, so don't
+	 * bother clearing it before.
+	 */
+	gfp &= ~__GFP_ZERO;
+
+	page = alloc_pages_node(dev_to_node(dev), gfp, order);
+	if (page) {
+		dma_addr_t addr;
+
+		/*
+		 * Since we will be clearing the encryption bit, check the
+		 * mask with it already cleared.
+		 */
+		addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
+		if ((addr + size) > dma_mask) {
+			__free_pages(page, get_order(size));
+		} else {
+			vaddr = page_address(page);
+			*dma_handle = addr;
+		}
+	}
+
+	if (!vaddr)
+		vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+
+	if (!vaddr)
+		return NULL;
+
+	/* Clear the SME encryption bit for DMA use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
+		set_memory_decrypted((unsigned long)vaddr, 1 << order);
+		memset(vaddr, 0, PAGE_SIZE << order);
+		*dma_handle = __sme_clr(*dma_handle);
+	}
+
+	return vaddr;
 }
 
+static void sev_free(struct device *dev, size_t size, void *vaddr,
+		     dma_addr_t dma_handle, unsigned long attrs)
+{
+	/* Set the SME encryption bit for re-use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
+		set_memory_encrypted((unsigned long)vaddr,
+				     1 << get_order(size));
+
+	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+	pgprot_t old_prot, new_prot;
+	unsigned long pfn, pa, size;
+	pte_t new_pte;
+
+	switch (level) {
+	case PG_LEVEL_4K:
+		pfn = pte_pfn(*kpte);
+		old_prot = pte_pgprot(*kpte);
+		break;
+	case PG_LEVEL_2M:
+		pfn = pmd_pfn(*(pmd_t *)kpte);
+		old_prot = pmd_pgprot(*(pmd_t *)kpte);
+		break;
+	case PG_LEVEL_1G:
+		pfn = pud_pfn(*(pud_t *)kpte);
+		old_prot = pud_pgprot(*(pud_t *)kpte);
+		break;
+	default:
+		return;
+	}
+
+	new_prot = old_prot;
+	if (enc)
+		pgprot_val(new_prot) |= _PAGE_ENC;
+	else
+		pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+	/* If prot is same then do nothing. */
+	if (pgprot_val(old_prot) == pgprot_val(new_prot))
+		return;
+
+	pa = pfn << page_level_shift(level);
+	size = page_level_size(level);
+
+	/*
+	 * We are going to perform in-place en-/decryption and change the
+	 * physical page attribute from C=1 to C=0 or vice versa. Flush the
+	 * caches to ensure that data gets accessed with the correct C-bit.
+	 */
+	clflush_cache_range(__va(pa), size);
+
+	/* Encrypt/decrypt the contents in-place */
+	if (enc)
+		sme_early_encrypt(pa, size);
+	else
+		sme_early_decrypt(pa, size);
+
+	/* Change the page encryption mask. */
+	new_pte = pfn_pte(pfn, new_prot);
+	set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+					   unsigned long size, bool enc)
+{
+	unsigned long vaddr_end, vaddr_next;
+	unsigned long psize, pmask;
+	int split_page_size_mask;
+	int level, ret;
+	pte_t *kpte;
+
+	vaddr_next = vaddr;
+	vaddr_end = vaddr + size;
+
+	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+		kpte = lookup_address(vaddr, &level);
+		if (!kpte || pte_none(*kpte)) {
+			ret = 1;
+			goto out;
+		}
+
+		if (level == PG_LEVEL_4K) {
+			__set_clr_pte_enc(kpte, level, enc);
+			vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+			continue;
+		}
+
+		psize = page_level_size(level);
+		pmask = page_level_mask(level);
+
+		/*
+		 * Check whether we can change the large page in one go.
+		 * We request a split when the address is not aligned and
+		 * the number of pages to set/clear encryption bit is smaller
+		 * than the number of pages in the large page.
+		 */
+		if (vaddr == (vaddr & pmask) &&
+		    ((vaddr_end - vaddr) >= psize)) {
+			__set_clr_pte_enc(kpte, level, enc);
+			vaddr_next = (vaddr & pmask) + psize;
+			continue;
+		}
+
+		/*
+		 * The virtual address is part of a larger page, create the next
+		 * level page table mapping (4K or 2M). If it is part of a 2M
+		 * page then we request a split of the large page into 4K
+		 * chunks. A 1GB large page is split into 2M pages, resp.
+		 */
+		if (level == PG_LEVEL_2M)
+			split_page_size_mask = 0;
+		else
+			split_page_size_mask = 1 << PG_LEVEL_2M;
+
+		kernel_physical_mapping_init(__pa(vaddr & pmask),
+					     __pa((vaddr_end & pmask) + psize),
+					     split_page_size_mask);
+	}
+
+	ret = 0;
+
+out:
+	__flush_tlb_all();
+	return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+	return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+	return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this.  When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example for this requirement.  Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted.  So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+	return sme_me_mask && !sev_enabled;
+}
+EXPORT_SYMBOL(sme_active);
+
+bool sev_active(void)
+{
+	return sme_me_mask && sev_enabled;
+}
+EXPORT_SYMBOL(sev_active);
+
+static const struct dma_map_ops sev_dma_ops = {
+	.alloc                  = sev_alloc,
+	.free                   = sev_free,
+	.map_page               = swiotlb_map_page,
+	.unmap_page             = swiotlb_unmap_page,
+	.map_sg                 = swiotlb_map_sg_attrs,
+	.unmap_sg               = swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu    = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu        = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device     = swiotlb_sync_sg_for_device,
+	.mapping_error          = swiotlb_dma_mapping_error,
+};
+
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void)
 {
@@ -201,7 +436,23 @@ void __init mem_encrypt_init(void)
 	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
 	swiotlb_update_mem_attributes();
 
-	pr_info("AMD Secure Memory Encryption (SME) active\n");
+	/*
+	 * With SEV, DMA operations cannot use encryption. New DMA ops
+	 * are required in order to mark the DMA areas as decrypted or
+	 * to use bounce buffers.
+	 */
+	if (sev_active())
+		dma_ops = &sev_dma_ops;
+
+	/*
+	 * With SEV, we need to unroll the rep string I/O instructions.
+	 */
+	if (sev_active())
+		static_branch_enable(&sev_enable_key);
+
+	pr_info("AMD %s active\n",
+		sev_active() ? "Secure Encrypted Virtualization (SEV)"
+			     : "Secure Memory Encryption (SME)");
 }
 
 void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
@@ -529,37 +780,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp)
 {
 	const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
 	unsigned int eax, ebx, ecx, edx;
+	unsigned long feature_mask;
 	bool active_by_default;
 	unsigned long me_mask;
 	char buffer[16];
 	u64 msr;
 
-	/* Check for the SME support leaf */
+	/* Check for the SME/SEV support leaf */
 	eax = 0x80000000;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 	if (eax < 0x8000001f)
 		return;
 
+#define AMD_SME_BIT	BIT(0)
+#define AMD_SEV_BIT	BIT(1)
 	/*
-	 * Check for the SME feature:
-	 *   CPUID Fn8000_001F[EAX] - Bit 0
-	 *     Secure Memory Encryption support
-	 *   CPUID Fn8000_001F[EBX] - Bits 5:0
-	 *     Pagetable bit position used to indicate encryption
+	 * Set the feature mask (SME or SEV) based on whether we are
+	 * running under a hypervisor.
+	 */
+	eax = 1;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+	/*
+	 * Check for the SME/SEV feature:
+	 *   CPUID Fn8000_001F[EAX]
+	 *   - Bit 0 - Secure Memory Encryption support
+	 *   - Bit 1 - Secure Encrypted Virtualization support
+	 *   CPUID Fn8000_001F[EBX]
+	 *   - Bits 5:0 - Pagetable bit position used to indicate encryption
 	 */
 	eax = 0x8000001f;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
-	if (!(eax & 1))
+	if (!(eax & feature_mask))
 		return;
 
 	me_mask = 1UL << (ebx & 0x3f);
 
-	/* Check if SME is enabled */
-	msr = __rdmsr(MSR_K8_SYSCFG);
-	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+	/* Check if memory encryption is enabled */
+	if (feature_mask == AMD_SME_BIT) {
+		/* For SME, check the SYSCFG MSR */
+		msr = __rdmsr(MSR_K8_SYSCFG);
+		if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+			return;
+	} else {
+		/* For SEV, check the SEV MSR */
+		msr = __rdmsr(MSR_AMD64_SEV);
+		if (!(msr & MSR_AMD64_SEV_ENABLED))
+			return;
+
+		/* SEV state cannot be controlled by a command line option */
+		sme_me_mask = me_mask;
+		sev_enabled = true;
 		return;
+	}
 
 	/*
 	 * Fixups have not been applied to phys_base yet and we're running
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 62474ba66c8e..4e1f6e1b8159 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __X86_MM_INTERNAL_H
 #define __X86_MM_INTERNAL_H
 
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index a99679826846..155ecbac9e28 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -33,6 +33,8 @@
 #include <linux/compat.h>
 #include <asm/elf.h>
 
+#include "physaddr.h"
+
 struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
@@ -174,3 +176,63 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 		return "[mpx]";
 	return NULL;
 }
+
+/**
+ * mmap_address_hint_valid - Validate the address hint of mmap
+ * @addr:	Address hint
+ * @len:	Mapping length
+ *
+ * Check whether @addr and @addr + @len result in a valid mapping.
+ *
+ * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
+ *
+ * On 64bit with 5-level page tables another sanity check is required
+ * because mappings requested by mmap(@addr, 0) which cross the 47-bit
+ * virtual address boundary can cause the following theoretical issue:
+ *
+ *  An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
+ *  is below the border of the 47-bit address space and @addr + @len is
+ *  above the border.
+ *
+ *  With 4-level paging this request succeeds, but the resulting mapping
+ *  address will always be within the 47-bit virtual address space, because
+ *  the hint address does not result in a valid mapping and is
+ *  ignored. Hence applications which are not prepared to handle virtual
+ *  addresses above 47-bit work correctly.
+ *
+ *  With 5-level paging this request would be granted and result in a
+ *  mapping which crosses the border of the 47-bit virtual address
+ *  space. If the application cannot handle addresses above 47-bit this
+ *  will lead to misbehaviour and hard to diagnose failures.
+ *
+ * Therefore ignore address hints which would result in a mapping crossing
+ * the 47-bit virtual address boundary.
+ *
+ * Note, that in the same scenario with MAP_FIXED the behaviour is
+ * different. The request with @addr < 47-bit and @addr + @len > 47-bit
+ * fails on a 4-level paging machine but succeeds on a 5-level paging
+ * machine. It is reasonable to expect that an application does not rely on
+ * the failure of such a fixed mapping request, so the restriction is not
+ * applied.
+ */
+bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
+{
+	if (TASK_SIZE - len < addr)
+		return false;
+
+	return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW);
+}
+
+/* Can we access it for direct reading/writing? Must be RAM: */
+int valid_phys_addr_range(phys_addr_t addr, size_t count)
+{
+	return addr + count <= __pa(high_memory);
+}
+
+/* Can we access it through mmap? Must be a valid physical address: */
+int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
+{
+	phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT;
+
+	return phys_addr_valid(addr + count - 1);
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 9ceaa955d2ba..e500949bae24 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * mpx.c - Memory Protection eXtensions
  *
@@ -12,6 +13,7 @@
 #include <linux/sched/sysctl.h>
 
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/mman.h>
 #include <asm/mmu_context.h>
 #include <asm/mpx.h>
@@ -60,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len)
 	return addr;
 }
 
-enum reg_type {
-	REG_TYPE_RM = 0,
-	REG_TYPE_INDEX,
-	REG_TYPE_BASE,
-};
-
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
-			  enum reg_type type)
-{
-	int regno = 0;
-
-	static const int regoff[] = {
-		offsetof(struct pt_regs, ax),
-		offsetof(struct pt_regs, cx),
-		offsetof(struct pt_regs, dx),
-		offsetof(struct pt_regs, bx),
-		offsetof(struct pt_regs, sp),
-		offsetof(struct pt_regs, bp),
-		offsetof(struct pt_regs, si),
-		offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
-		offsetof(struct pt_regs, r8),
-		offsetof(struct pt_regs, r9),
-		offsetof(struct pt_regs, r10),
-		offsetof(struct pt_regs, r11),
-		offsetof(struct pt_regs, r12),
-		offsetof(struct pt_regs, r13),
-		offsetof(struct pt_regs, r14),
-		offsetof(struct pt_regs, r15),
-#endif
-	};
-	int nr_registers = ARRAY_SIZE(regoff);
-	/*
-	 * Don't possibly decode a 32-bit instructions as
-	 * reading a 64-bit-only register.
-	 */
-	if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
-		nr_registers -= 8;
-
-	switch (type) {
-	case REG_TYPE_RM:
-		regno = X86_MODRM_RM(insn->modrm.value);
-		if (X86_REX_B(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	case REG_TYPE_INDEX:
-		regno = X86_SIB_INDEX(insn->sib.value);
-		if (X86_REX_X(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	case REG_TYPE_BASE:
-		regno = X86_SIB_BASE(insn->sib.value);
-		if (X86_REX_B(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	default:
-		pr_err("invalid register type");
-		BUG();
-		break;
-	}
-
-	if (regno >= nr_registers) {
-		WARN_ONCE(1, "decoded an instruction with an invalid register");
-		return -EINVAL;
-	}
-	return regoff[regno];
-}
-
-/*
- * return the address being referenced be instruction
- * for rm=3 returning the content of the rm reg
- * for rm!=3 calculates the address using SIB and Disp
- */
-static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
-{
-	unsigned long addr, base, indx;
-	int addr_offset, base_offset, indx_offset;
-	insn_byte_t sib;
-
-	insn_get_modrm(insn);
-	insn_get_sib(insn);
-	sib = insn->sib.value;
-
-	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
-		addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
-		if (addr_offset < 0)
-			goto out_err;
-		addr = regs_get_register(regs, addr_offset);
-	} else {
-		if (insn->sib.nbytes) {
-			base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
-			if (base_offset < 0)
-				goto out_err;
-
-			indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
-			if (indx_offset < 0)
-				goto out_err;
-
-			base = regs_get_register(regs, base_offset);
-			indx = regs_get_register(regs, indx_offset);
-			addr = base + indx * (1 << X86_SIB_SCALE(sib));
-		} else {
-			addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
-			if (addr_offset < 0)
-				goto out_err;
-			addr = regs_get_register(regs, addr_offset);
-		}
-		addr += insn->displacement.value;
-	}
-	return (void __user *)addr;
-out_err:
-	return (void __user *)-1;
-}
-
 static int mpx_insn_decode(struct insn *insn,
 			   struct pt_regs *regs)
 {
@@ -289,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 	info->si_signo = SIGSEGV;
 	info->si_errno = 0;
 	info->si_code = SEGV_BNDERR;
-	info->si_addr = mpx_get_addr_ref(&insn, regs);
+	info->si_addr = insn_get_addr_ref(&insn, regs);
 	/*
 	 * We were not able to extract an address from the instruction,
 	 * probably because there was something invalid in it.
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9405ffc91502..066f3511d5f1 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Generic VM initialization for x86-64 NUMA setups.
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index d805162e6045..34a2a3bfde9c 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * NUMA emulation
  */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ad86ec91e640..86860f279662 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __X86_MM_NUMA_INTERNAL_H
 #define __X86_MM_NUMA_INTERNAL_H
 
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 5f169d5d76a8..a25588ad75ef 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * self test for change_page_attr.
  *
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfb7d657cf43..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	base = alloc_pages(GFP_KERNEL, 0);
 	if (!debug_pagealloc_enabled())
 		spin_lock(&cpa_lock);
 	if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 
 static int alloc_pte_page(pmd_t *pmd)
 {
-	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pte)
 		return -1;
 
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
 
 static int alloc_pmd_page(pud_t *pud)
 {
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pmd)
 		return -1;
 
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	pgd_entry = cpa->pgd + pgd_index(addr);
 
 	if (pgd_none(*pgd_entry)) {
-		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
 		if (!p4d)
 			return -1;
 
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	 */
 	p4d = p4d_offset(pgd_entry, addr);
 	if (p4d_none(*p4d)) {
-		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pud)
 			return -1;
 
@@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 	unsigned long start;
 	int ret;
 
-	/* Nothing to do if the SME is not active */
-	if (!sme_active())
+	/* Nothing to do if memory encryption is not active */
+	if (!mem_encrypt_active())
 		return 0;
 
 	/* Should not be working on unaligned addresses */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index a739bfc40690..eeb5caeb089b 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __PAT_INTERNAL_H_
 #define __PAT_INTERNAL_H_
 
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index d76485b22824..fa16036fa592 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Handle caching attributes in page tables (PAT)
  *
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b372f3442bbf..004abf9ebf12 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <asm/pgalloc.h>
@@ -6,7 +7,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -354,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b9bd5b8b14fa..c3c5274410a9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -9,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index cfc3b9121ce4..7f9acb68324c 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/bootmem.h>
 #include <linux/mmdebug.h>
 #include <linux/export.h>
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
index a3cd5a0c97b3..9f6419cafc32 100644
--- a/arch/x86/mm/physaddr.h
+++ b/arch/x86/mm/physaddr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #include <asm/processor.h>
 
 static inline int phys_addr_valid(resource_size_t addr)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..43d4a4a29037
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ *	https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ *   Signed-off-by: Richard Fellner <[email protected]>
+ *   Signed-off-by: Moritz Lipp <[email protected]>
+ *   Signed-off-by: Daniel Gruss <[email protected]>
+ *   Signed-off-by: Michael Schwarz <[email protected]>
+ *
+ * Major changes to the original code by: Dave Hansen <[email protected]>
+ * Mostly rewritten by Thomas Gleixner <[email protected]> and
+ *		       Andy Lutomirsky <[email protected]>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK	0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+	char arg[5];
+	int ret;
+
+	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+		pti_print_if_insecure("disabled on XEN PV.");
+		return;
+	}
+
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0)  {
+		if (ret == 3 && !strncmp(arg, "off", 3)) {
+			pti_print_if_insecure("disabled on command line.");
+			return;
+		}
+		if (ret == 2 && !strncmp(arg, "on", 2)) {
+			pti_print_if_secure("force enabled on command line.");
+			goto enable;
+		}
+		if (ret == 4 && !strncmp(arg, "auto", 4))
+			goto autosel;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+		pti_print_if_insecure("disabled on command line.");
+		return;
+	}
+
+autosel:
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		return;
+enable:
+	setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Changes to the high (kernel) portion of the kernelmode page
+	 * tables are not automatically propagated to the usermode tables.
+	 *
+	 * Users should keep in mind that, unlike the kernelmode tables,
+	 * there is no vmalloc_fault equivalent for the usermode tables.
+	 * Top-level entries added to init_mm's usermode pgd after boot
+	 * will not be automatically propagated to other mms.
+	 */
+	if (!pgdp_maps_userspace(pgdp))
+		return pgd;
+
+	/*
+	 * The user page tables get the full PGD, accessible from
+	 * userspace:
+	 */
+	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+	/*
+	 * If this is normal user memory, make it NX in the kernel
+	 * pagetables so that, if we somehow screw up and return to
+	 * usermode with the kernel CR3 loaded, we'll get a page fault
+	 * instead of allowing user code to execute with the wrong CR3.
+	 *
+	 * As exceptions, we don't set NX if:
+	 *  - _PAGE_USER is not set.  This could be an executable
+	 *     EFI runtime mapping or something similar, and the kernel
+	 *     may execute from it
+	 *  - we don't have NX support
+	 *  - we're clearing the PGD (i.e. the new pgd is not present).
+	 */
+	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+	    (__supported_pte_mask & _PAGE_NX))
+		pgd.pgd |= _PAGE_NX;
+
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (address < PAGE_OFFSET) {
+		WARN_ONCE(1, "attempt to walk user address\n");
+		return NULL;
+	}
+
+	if (pgd_none(*pgd)) {
+		unsigned long new_p4d_page = __get_free_page(gfp);
+		if (!new_p4d_page)
+			return NULL;
+
+		if (pgd_none(*pgd)) {
+			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+			new_p4d_page = 0;
+		}
+		if (new_p4d_page)
+			free_page(new_p4d_page);
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+	pud_t *pud;
+
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		if (p4d_none(*p4d)) {
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+			new_pud_page = 0;
+		}
+		if (new_pud_page)
+			free_page(new_pud_page);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The user page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+			new_pmd_page = 0;
+		}
+		if (new_pmd_page)
+			free_page(new_pmd_page);
+	}
+
+	return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.  Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables.  It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+	pte_t *pte;
+
+	/* We can't do anything sensible if we hit a large mapping. */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+			new_pte_page = 0;
+		}
+		if (new_pte_page)
+			free_page(new_pte_page);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	if (pte_flags(*pte) & _PAGE_USER) {
+		WARN_ONCE(1, "attempt to walk to user pte\n");
+		return NULL;
+	}
+	return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+	pte_t *pte, *target_pte;
+	unsigned int level;
+
+	pte = lookup_address(VSYSCALL_ADDR, &level);
+	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+		return;
+
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	if (WARN_ON(!target_pte))
+		return;
+
+	*target_pte = *pte;
+	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+	unsigned long addr;
+
+	/*
+	 * Clone the populated PMDs which cover start to end. These PMD areas
+	 * can have holes.
+	 */
+	for (addr = start; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd, *target_pmd;
+		pgd_t *pgd;
+		p4d_t *p4d;
+		pud_t *pud;
+
+		pgd = pgd_offset_k(addr);
+		if (WARN_ON(pgd_none(*pgd)))
+			return;
+		p4d = p4d_offset(pgd, addr);
+		if (WARN_ON(p4d_none(*p4d)))
+			return;
+		pud = pud_offset(p4d, addr);
+		if (pud_none(*pud))
+			continue;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd))
+			continue;
+
+		target_pmd = pti_user_pagetable_walk_pmd(addr);
+		if (WARN_ON(!target_pmd))
+			return;
+
+		/*
+		 * Copy the PMD.  That is, the kernelmode and usermode
+		 * tables will share the last-level page tables of this
+		 * address range
+		 */
+		*target_pmd = pmd_clear_flags(*pmd, clear);
+	}
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
+
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
+	*user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visinble page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+	pti_clone_pmds((unsigned long) __entry_text_start,
+			(unsigned long) __irqentry_text_end,
+		       _PAGE_RW | _PAGE_GLOBAL);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("enabled\n");
+
+	pti_clone_user_shared();
+	pti_clone_entry_text();
+	pti_setup_espfix64();
+	pti_setup_vsyscall();
+}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f505b6..adb3c5784dac 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/init.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 3ea20d61b523..dac07e4f5834 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * ACPI 3.0 based NUMA setup
  * Copyright 2004 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0f3d0cea4d00..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}
 
+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }
 
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		invalidate_user_asid(new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -85,6 +139,7 @@ void leave_mm(int cpu)
 
 	switch_mm(NULL, &init_mm, NULL);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -127,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -194,13 +249,23 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next, new_asid));
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
+			load_new_mm_cr3(next->pgd, new_asid, true);
+
+			/*
+			 * NB: This gets called via leave_mm() in the idle path
+			 * where RCU functions differently.  Tracing normally
+			 * uses RCU, so we need to use the _rcuidle variant.
+			 *
+			 * (There is no good reason for this.  The idle code should
+			 *  be rearranged to call this before rcu_idle_enter().)
+			 */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next, new_asid));
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+			load_new_mm_cr3(next->pgd, new_asid, false);
+
+			/* See above wrt _rcuidle. */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 		}
 
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
@@ -277,7 +342,7 @@ void initialize_tlbstate_and_flush(void)
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(build_cr3(mm, 0));
+	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -540,7 +605,7 @@ static void do_kernel_range_flush(void *info)
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_single(addr);
+		__flush_tlb_one(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)