aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c107
-rw-r--r--arch/x86/mm/hugetlbpage.c8
-rw-r--r--arch/x86/mm/init.c148
-rw-r--r--arch/x86/mm/numa.c11
-rw-r--r--arch/x86/mm/pgtable.c23
-rw-r--r--arch/x86/mm/tlb.c14
7 files changed, 239 insertions, 74 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5fecf76..6a19ad9f370d 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4ae3047..9ff85bb8dd69 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,9 @@
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_START */
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
/*
* Page fault error code bits:
*
@@ -51,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -596,7 +599,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
dump_pagetable(address);
}
@@ -842,23 +845,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline int
+static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
- /*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue pagefault.
- */
- if (fatal_signal_pending(current)) {
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- if (!(error_code & PF_USER))
- no_context(regs, error_code, address, 0, 0);
- return 1;
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address, 0, 0);
+ return;
}
- if (!(fault & VM_FAULT_ERROR))
- return 0;
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
@@ -866,7 +861,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
- return 1;
+ return;
}
up_read(&current->mm->mmap_sem);
@@ -884,7 +879,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
- return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1011,9 +1005,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long address;
struct mm_struct *mm;
int fault;
- int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
- (write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@@ -1059,7 +1051,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
/* kprobes don't want to hook the spurious faults: */
- if (notify_page_fault(regs))
+ if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -1071,22 +1063,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(notify_page_fault(regs)))
+ if (unlikely(kprobes_fault(regs)))
return;
- /*
- * It's safe to allow irq's after cr2 has been saved and the
- * vmalloc fault has been handled.
- *
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet:
- */
- if (user_mode_vm(regs)) {
- local_irq_enable();
- error_code |= PF_USER;
- } else {
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
- }
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
@@ -1098,8 +1076,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
}
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
@@ -1110,6 +1086,27 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
/*
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet:
+ */
+ if (user_mode_vm(regs)) {
+ local_irq_enable();
+ error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
+ } else {
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+
+ /*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
@@ -1187,9 +1184,17 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (mm_fault_error(regs, error_code, address, fault))
- return;
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ return;
+
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
/*
@@ -1230,3 +1235,23 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
}
+
+static void trace_page_fault_entries(struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(read_cr2(), regs, error_code);
+ else
+ trace_page_fault_kernel(read_cr2(), regs, error_code);
+}
+
+dotraplinkage void __kprobes
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ trace_page_fault_entries(regs, error_code);
+ __do_page_fault(regs, error_code);
+ exception_exit(prev_state);
+}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 7e73e8c69096..9d980d88b747 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
+int pmd_huge_support(void)
+{
+ return 0;
+}
#else
struct page *
@@ -77,6 +81,10 @@ int pud_huge(pud_t pud)
return !!(pud_val(pud) & _PAGE_PSE);
}
+int pmd_huge_support(void)
+{
+ return 1;
+}
#endif
/* x86_64 also uses this file */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 04664cdb7fda..f97130618113 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -53,12 +53,12 @@ __ref void *alloc_low_pages(unsigned int num)
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_page: ran out of memory");
+ panic("alloc_low_pages: ran out of memory");
ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
if (!ret)
- panic("alloc_low_page: can not alloc memory");
+ panic("alloc_low_pages: can not alloc memory");
memblock_reserve(ret, PAGE_SIZE * num);
pfn = ret >> PAGE_SHIFT;
} else {
@@ -399,29 +399,46 @@ static unsigned long __init init_range_memory_mapping(
return mapped_ram_size;
}
-/* (PUD_SHIFT-PMD_SHIFT)/2 */
-#define STEP_SIZE_SHIFT 5
-void __init init_mem_mapping(void)
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+ /*
+ * Explain why we shift by 5 and why we don't have to worry about
+ * 'step_size << 5' overflowing:
+ *
+ * initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * In worse case, when we cross the 1G boundary, and
+ * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+ * to map 1G range with PTE. Use 5 as shift for now.
+ *
+ * Don't need to worry about overflow, on 32bit, when step_size
+ * is 0, round_down() returns 0 for start, and that turns it
+ * into 0x100000000ULL.
+ */
+ return step_size << 5;
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
{
- unsigned long end, real_end, start, last_start;
+ unsigned long real_end, start, last_start;
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
unsigned long new_mapped_ram_size;
- probe_page_size_mask();
-
-#ifdef CONFIG_X86_64
- end = max_pfn << PAGE_SHIFT;
-#else
- end = max_low_pfn << PAGE_SHIFT;
-#endif
-
- /* the ISA range is always mapped regardless of memory holes */
- init_memory_mapping(0, ISA_END_ADDRESS);
-
/* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE);
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
@@ -436,25 +453,106 @@ void __init init_mem_mapping(void)
* end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
* for page table.
*/
- while (last_start > ISA_END_ADDRESS) {
+ while (last_start > map_start) {
if (last_start > step_size) {
start = round_down(last_start - 1, step_size);
- if (start < ISA_END_ADDRESS)
- start = ISA_END_ADDRESS;
+ if (start < map_start)
+ start = map_start;
} else
- start = ISA_END_ADDRESS;
+ start = map_start;
new_mapped_ram_size = init_range_memory_mapping(start,
last_start);
last_start = start;
min_pfn_mapped = last_start >> PAGE_SHIFT;
/* only increase step_size after big range get mapped */
if (new_mapped_ram_size > mapped_ram_size)
- step_size <<= STEP_SIZE_SHIFT;
+ step_size = get_new_step_size(step_size);
mapped_ram_size += new_mapped_ram_size;
}
- if (real_end < end)
- init_range_memory_mapping(real_end, end);
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, new_mapped_ram_size, start;
+ unsigned long mapped_ram_size = 0;
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (start < map_end) {
+ if (map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else
+ next = map_end;
+
+ new_mapped_ram_size = init_range_memory_mapping(start, next);
+ start = next;
+
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+ * we need two separate calls here. This is because we want to
+ * allocate page tables above the kernel. So we first map
+ * [kernel_end, end) to make memory above the kernel be mapped
+ * as soon as possible. And then use page tables allocated above
+ * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93bae1f13..24aec58d6afd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -567,6 +567,17 @@ static int __init numa_init(int (*init_func)(void))
ret = init_func();
if (ret < 0)
return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+ * parsed SRAT in init_func(). It is ok to have the
+ * reset here even if we did't configure ACPI_NUMA
+ * or acpi numa init fails and fallbacks to dummy
+ * numa init.
+ */
+ memblock_set_bottom_up(false);
+
ret = numa_cleanup_meminfo(&numa_meminfo);
if (ret < 0)
return ret;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be1..c96314abd144 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
- if (pte)
- pgtable_page_ctor(pte);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
return pte;
}
@@ -57,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
+ struct page *page = virt_to_page(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -65,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- tlb_remove_page(tlb, virt_to_page(pmd));
+ pgtable_pmd_page_dtor(page);
+ tlb_remove_page(tlb, page);
}
#if PAGETABLE_LEVELS > 3
@@ -189,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
- if (pmds[i])
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
+ }
}
static int preallocate_pmds(pmd_t *pmds[])
@@ -200,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
- if (pmd == NULL)
+ if (!pmd)
failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
+ failed = true;
+ }
pmds[i] = pmd;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 282375f13c7e..ae699b3bbac8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,6 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
+ count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
+ count_vm_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -149,6 +151,7 @@ void flush_tlb_current_task(void)
preempt_disable();
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
- else {
+ } else {
if (has_large_page(mm, start, end)) {
local_flush_tlb();
goto flush_all;
}
/* flush range by one by one 'invlpg' */
- for (addr = start; addr < end; addr += PAGE_SIZE)
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
+ }
if (cpumask_any_but(mm_cpumask(mm),
smp_processor_id()) < nr_cpu_ids)
@@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
+ count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
+ count_vm_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}