Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             48
-rw-r--r--  mm/Makefile            10
-rw-r--r--  mm/backing-dev.c       74
-rw-r--r--  mm/bootmem.c           13
-rw-r--r--  mm/compaction.c       186
-rw-r--r--  mm/dmapool.c           18
-rw-r--r--  mm/filemap.c           98
-rw-r--r--  mm/highmem.c           66
-rw-r--r--  mm/huge_memory.c     2351
-rw-r--r--  mm/hugetlb.c          290
-rw-r--r--  mm/internal.h           9
-rw-r--r--  mm/kmemleak-test.c      6
-rw-r--r--  mm/kmemleak.c          13
-rw-r--r--  mm/ksm.c               88
-rw-r--r--  mm/maccess.c            2
-rw-r--r--  mm/madvise.c           10
-rw-r--r--  mm/memblock.c         835
-rw-r--r--  mm/memcontrol.c       829
-rw-r--r--  mm/memory-failure.c   284
-rw-r--r--  mm/memory.c           399
-rw-r--r--  mm/memory_hotplug.c   100
-rw-r--r--  mm/mempolicy.c         43
-rw-r--r--  mm/migrate.c          329
-rw-r--r--  mm/mincore.c            7
-rw-r--r--  mm/mlock.c            170
-rw-r--r--  mm/mmap.c              35
-rw-r--r--  mm/mmu_notifier.c      20
-rw-r--r--  mm/mmzone.c            21
-rw-r--r--  mm/mprotect.c          22
-rw-r--r--  mm/mremap.c            17
-rw-r--r--  mm/nommu.c             84
-rw-r--r--  mm/oom_kill.c          33
-rw-r--r--  mm/page-writeback.c    42
-rw-r--r--  mm/page_alloc.c       401
-rw-r--r--  mm/page_isolation.c     3
-rw-r--r--  mm/pagewalk.c           6
-rw-r--r--  mm/percpu-km.c          8
-rw-r--r--  mm/percpu-vm.c          2
-rw-r--r--  mm/percpu.c           415
-rw-r--r--  mm/percpu_up.c         30
-rw-r--r--  mm/pgtable-generic.c  121
-rw-r--r--  mm/rmap.c             130
-rw-r--r--  mm/shmem.c             26
-rw-r--r--  mm/slab.c              84
-rw-r--r--  mm/slob.c               9
-rw-r--r--  mm/slub.c             876
-rw-r--r--  mm/sparse-vmemmap.c    13
-rw-r--r--  mm/sparse.c             4
-rw-r--r--  mm/swap.c             132
-rw-r--r--  mm/swap_state.c         6
-rw-r--r--  mm/swapfile.c          66
-rw-r--r--  mm/truncate.c          15
-rw-r--r--  mm/util.c              34
-rw-r--r--  mm/vmalloc.c          185
-rw-r--r--  mm/vmscan.c           577
-rw-r--r--  mm/vmstat.c           246
56 files changed, 7469 insertions(+), 2472 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
config COMPACTION
bool "Allow for memory compaction"
select MIGRATION
- depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+ depends on MMU
help
Allows the compaction of memory for the allocation of huge pages.
@@ -301,3 +301,49 @@ config NOMMU_INITIAL_TRIM_EXCESS
of 1 says that all excess pages should be trimmed.
See Documentation/nommu-mmap.txt for more information.
+
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on X86 && MMU
+ select COMPACTION
+ help
+	  Transparent Hugepages allows the kernel to use huge pages and
+	  huge TLB entries transparently for applications whenever possible.
+	  This feature can improve computing performance for certain
+	  applications by speeding up page faults during memory
+	  allocation, by reducing the number of TLB misses and by speeding
+	  up pagetable walks.
+
+	  If memory is constrained on an embedded system, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+	    Enabling Transparent Hugepage always can increase the
+	    memory footprint of applications without a guaranteed
+	    benefit, but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+	    Enabling Transparent Hugepage madvise will only provide a
+	    performance benefit to applications that use
+	    madvise(MADV_HUGEPAGE), but it won't risk increasing the
+	    memory footprint of applications without a guaranteed
+	    benefit.
+endchoice
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
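
[Aside] To illustrate the new TRANSPARENT_HUGEPAGE_MADVISE default, here is a
minimal userspace sketch (not part of this patch) that opts a region into THP
with madvise(MADV_HUGEPAGE). It assumes a 2MB huge page size, and the hint is
only honoured on kernels with this feature; on older kernels madvise() fails
with EINVAL.

	#include <stdlib.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 16UL * 2 * 1024 * 1024;	/* 16 x 2MB (assumed HPAGE size) */
		void *buf;

		if (posix_memalign(&buf, 2 * 1024 * 1024, len))
			return 1;
		/* hint only: the kernel may still back this range with 4k pages */
		if (madvise(buf, len, MADV_HUGEPAGE))
			return 1;
		/* touching the memory can now fault in huge pmds */
		return 0;
	}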
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,13 +5,13 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o
+ vmalloc.o pagewalk.o pgtable-generic.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o \
+ page_isolation.o mm_init.o mmu_context.o percpu.o \
$(mmu-y)
obj-y += init-mm.o
@@ -36,12 +36,8 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-ifdef CONFIG_SMP
-obj-y += percpu.o
-else
-obj-y += percpu_up.o
-endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(inode, &wb->b_dirty, i_list)
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
spin_unlock(&inode_lock);
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
/*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
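
[Aside] For context, a minimal sketch (not part of the patch) of how a reclaim
path might use the new primitive instead of an unconditional
congestion_wait(); the authoritative callers are added in mm/vmscan.c:

	static void throttle_if_congested(struct zone *zone)
	{
		/*
		 * Sleeps up to HZ/10 only if some bdi is congested AND this
		 * zone was recently flagged reclaim-congested; otherwise it
		 * degenerates to cond_resched() and returns the full timeout.
		 */
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
	}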
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a54993..13b0caa9793c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
+#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(physaddr, physaddr + size);
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
#else
unsigned long start, end;
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
void __init free_bootmem(unsigned long addr, unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(addr, addr + size);
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
#else
unsigned long start, end;
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
}
#ifndef CONFIG_NO_BOOTMEM
+int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
+
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..8be430b812de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
#include <linux/sysfs.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
/* Account for isolated anon and file pages */
unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
+
+ int compact_mode;
};
static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
struct list_head *freelist)
{
unsigned long zone_end_pfn, end_pfn;
- int total_isolated = 0;
+ int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
/* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
if (!pfn_valid_within(blockpfn))
continue;
+ nr_scanned++;
if (!PageBuddy(page))
continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
}
}
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
return total_isolated;
}
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
/* Do not scan outside zone boundaries */
@@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct page *page;
if (!pfn_valid_within(low_pfn))
continue;
+ nr_scanned++;
/* Get the page and skip if free */
page = pfn_to_page(low_pfn);
if (PageBuddy(page))
continue;
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ continue;
+ }
+
+ if (!PageLRU(page))
+ continue;
+
+ /*
+ * PageLRU is set, and lru_lock excludes isolation,
+ * splitting and collapsing (collapsing has already
+ * happened if PageLRU is set).
+ */
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
/* Try isolate the page */
if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
continue;
+ VM_BUG_ON(PageTransCompound(page));
+
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
- mem_cgroup_del_lru(page);
cc->nr_migratepages++;
+ nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
return cc->nr_migratepages;
}
@@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
}
static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+ unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
@@ -355,12 +397,31 @@ static int compact_finished(struct zone *zone,
return COMPACT_COMPLETE;
/* Compaction run is not finished if the watermark is not met */
+ if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+ watermark = low_wmark_pages(zone);
+ else
+ watermark = high_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
if (cc->order == -1)
return COMPACT_CONTINUE;
+ /*
+ * Generating only one page of the right order is not enough
+ * for kswapd, we must continue until we're above the high
+ * watermark as a pool for high order GFP_ATOMIC allocations
+ * too.
+ */
+ if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+ return COMPACT_CONTINUE;
+
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
@@ -375,10 +436,69 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ int fragindex;
+ unsigned long watermark;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note the 2UL.
+ * This is because during migration, copies of pages need to be
+ * allocated and for a short time, the footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return COMPACT_SKIPPED;
+
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1 implies allocations might succeed depending on watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ return COMPACT_SKIPPED;
+
+ if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ return COMPACT_PARTIAL;
+
+ return COMPACT_CONTINUE;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -394,7 +514,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc, 0);
+ (unsigned long)cc, false,
+ cc->sync);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -402,6 +523,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
/* Release LRU pages not migrated */
if (!list_empty(&cc->migratepages)) {
@@ -418,8 +541,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync,
+ int compact_mode)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -427,6 +552,8 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
+ .sync = sync,
+ .compact_mode = compact_mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +569,17 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask)
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
- unsigned long watermark;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
@@ -461,7 +589,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
* made because an assumption is made that the page allocator can satisfy
* the "cheaper" orders without taking special steps
*/
- if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+ if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
@@ -469,43 +597,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
- int fragindex;
int status;
- /*
- * Watermarks for order-0 must be met for compaction. Note
- * the 2UL. This is because during migration, copies of
- * pages need to be allocated and for a short time, the
- * footprint is higher
- */
- watermark = low_wmark_pages(zone) + (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- continue;
-
- /*
- * fragmentation index determines if allocation failures are
- * due to low memory or external fragmentation
- *
- * index of -1 implies allocations might succeed depending
- * on watermarks
- * index towards 0 implies failure is due to lack of memory
- * index towards 1000 implies failure is due to fragmentation
- *
- * Only compact if a failure would be due to fragmentation.
- */
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- continue;
-
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
- rc = COMPACT_PARTIAL;
- break;
- }
-
- status = compact_zone_order(zone, order, gfp_mask);
+ status = compact_zone_order(zone, order, gfp_mask, sync,
+ COMPACT_MODE_DIRECT_RECLAIM);
rc = max(status, rc);
- if (zone_watermark_ok(zone, order, watermark, 0, 0))
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
}
@@ -532,6 +631,7 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
+ .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
};
zone = &pgdat->node_zones[zoneid];
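
[Aside] As a usage note (not part of the patch), the factored-out
compaction_suitable() lets callers gate compaction cheaply before doing any
work; a sketch of the resulting pattern:

	static bool worth_compacting(struct zone *zone, int order)
	{
		switch (compaction_suitable(zone, order)) {
		case COMPACT_SKIPPED:	/* too few free pages to migrate into */
		case COMPACT_PARTIAL:	/* the allocation would already succeed */
			return false;
		default:
			return true;	/* COMPACT_CONTINUE: run compaction now */
		}
	}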
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
spin_lock_irqsave(&pool->lock, flags);
restart:
list_for_each_entry(page, &pool->page_list, page_list) {
@@ -322,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -353,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
{
- unsigned long flags;
struct dma_page *page;
- spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
if (dma < page->dma)
continue;
if (dma < (page->dma + pool->allocation))
- goto done;
+ return page;
}
- page = NULL;
- done:
- spin_unlock_irqrestore(&pool->lock, flags);
- return page;
+ return NULL;
}
/**
@@ -384,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
unsigned long flags;
unsigned int offset;
+ spin_lock_irqsave(&pool->lock, flags);
page = pool_find_page(pool, dma);
if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -399,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
offset = vaddr - page->vaddr;
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -416,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
chain = *(int *)(page->vaddr + chain);
continue;
}
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
"already free\n", pool->name,
@@ -430,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, POOL_POISON_FREED, pool->size);
#endif
- spin_lock_irqsave(&pool->lock, flags);
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
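
[Aside] A brief driver-side sketch (not part of the patch) of the contract the
new might_sleep_if() makes explicit: a blocking gfp mask means dma_pool_alloc()
may sleep, so it must not be called from atomic context. The device pointer
dev and the surrounding probe function are assumed:

	static int example_probe(struct device *dev)
	{
		struct dma_pool *pool;
		dma_addr_t handle;
		void *vaddr;

		pool = dma_pool_create("example", dev, 64, 64, 0);
		if (!pool)
			return -ENOMEM;
		vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle); /* may sleep */
		if (vaddr)
			dma_pool_free(pool, vaddr, handle);
		dma_pool_destroy(pool);
		return 0;
	}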
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
* ->inode_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_lock
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page)
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);
@@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
@@ -612,6 +614,19 @@ void __lock_page_nosync(struct page *page)
TASK_UNINTERRUPTIBLE);
}
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
+ __lock_page(page);
+ return 1;
+ } else {
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ return 0;
+ }
+}
+
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -631,7 +646,9 @@ repeat:
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
- if (unlikely(!page || page == RADIX_TREE_RETRY))
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_deref_retry(page))
goto repeat;
if (!page_cache_get_speculative(page))
@@ -647,6 +664,7 @@ repeat:
goto repeat;
}
}
+out:
rcu_read_unlock();
return page;
@@ -764,12 +782,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page)) {
+ if (ret)
+ start = pages[ret-1]->index;
goto restart;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -817,16 +834,9 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
@@ -836,6 +846,16 @@ repeat:
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
@@ -874,11 +894,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (!page_cache_get_speculative(page))
@@ -1016,6 +1032,9 @@ find_page:
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
@@ -1539,25 +1558,30 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- lock_page(page);
-
- /* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto no_cached_page;
- }
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
- page = find_lock_page(mapping, offset);
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2201,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = end;
+ *ppos = pos;
}
out:
return written;
@@ -2203,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
+ if (page)
return page;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
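
[Aside] For reference, a condensed sketch (not part of the patch) of the RCU
lookup pattern once radix_tree_deref_retry() replaces the open-coded
RADIX_TREE_RETRY comparison, as in find_get_page() above; pagep is an assumed
slot pointer inside a lookup function:

	rcu_read_lock();
repeat:
	page = radix_tree_deref_slot(pagep);
	if (page) {
		if (radix_tree_deref_retry(page))
			goto repeat;	/* slot moved under us: restart lookup */
		if (!page_cache_get_speculative(page))
			goto repeat;	/* lost a reference race: try again */
	}
	rcu_read_unlock();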
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
#include <linux/kgdb.h>
#include <asm/tlbflush.h>
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-
-void debug_kmap_atomic(enum km_type type)
-{
- static int warn_count = 10;
-
- if (unlikely(warn_count < 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_nmi()) {
- if (type != KM_NMI && type != KM_NMI_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
- type == KM_IRQ_PTE || type == KM_NMI ||
- type == KM_NMI_PTE ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
- WARN_ON(1);
- warn_count--;
- }
-#endif /* CONFIG_KGDB_KDB */
-}
-
-#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..3e29781ee762
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2351 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * By default, collapse hugepages if there is at least one pte mapped
+ * as it would have been mapped had the vma been large enough at
+ * page fault time.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
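
[Aside] A sketch of the hash lookup this structure supports (the patch's own
helper appears further down the file, beyond this excerpt; this version
mirrors the scheme used by mm/ksm.c):

	static struct mm_slot *get_mm_slot(struct mm_struct *mm)
	{
		struct mm_slot *mm_slot;
		struct hlist_head *bucket;
		struct hlist_node *node;

		/* hash the mm pointer into one of MM_SLOTS_HASH_HEADS buckets */
		bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
					% MM_SLOTS_HASH_HEADS];
		hlist_for_each_entry(mm_slot, node, bucket, hash)
			if (mm == mm_slot->mm)
				return mm_slot;
		return NULL;
	}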
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+} khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+ extern int min_free_kbytes;
+
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) &&
+ !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types. There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();
+ return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+static int start_khugepaged(void)
+{
+ int err = 0;
+ if (khugepaged_enabled()) {
+ int wakeup;
+ if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (unlikely(IS_ERR(khugepaged_thread))) {
+ printk(KERN_ERR
+ "khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+ wakeup = !list_empty(&khugepaged_scan.mm_head);
+ mutex_unlock(&khugepaged_mutex);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else
+ /* wakeup to exit */
+ wake_up_interruptible(&khugepaged_wait);
+out:
+ return err;
+}
+
+#ifdef CONFIG_SYSFS
+
+static ssize_t double_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (test_bit(enabled, &transparent_hugepage_flags)) {
+ VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+ return sprintf(buf, "[always] madvise never\n");
+ } else if (test_bit(req_madv, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
+}
+static ssize_t double_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ set_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+}
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+ if (ret > 0) {
+ int err = start_khugepaged();
+ if (err)
+ ret = err;
+ }
+
+ if (ret > 0 &&
+ (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) ||
+ test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags)))
+ set_recommended_min_free_kbytes();
+
+ return ret;
+}
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static ssize_t single_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
+{
+ if (test_bit(flag, &transparent_hugepage_flags))
+ return sprintf(buf, "[yes] no\n");
+ else
+ return sprintf(buf, "yes [no]\n");
+}
+static ssize_t single_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag flag)
+{
+ if (!memcmp("yes", buf,
+ min(sizeof("yes")-1, count))) {
+ set_bit(flag, &transparent_hugepage_flags);
+ } else if (!memcmp("no", buf,
+ min(sizeof("no")-1, count))) {
+ clear_bit(flag, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+/*
+ * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
+ * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
+ * memory just to allocate one more hugepage.
+ */
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static struct kobj_attribute defrag_attr =
+ __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+#ifdef CONFIG_DEBUG_VM
+static ssize_t debug_cow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static ssize_t debug_cow_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static struct kobj_attribute debug_cow_attr =
+ __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = strict_strtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes, in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = strict_strtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+#ifdef CONFIG_SYSFS
+ static struct kobject *hugepage_kobj;
+#endif
+
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = -ENOMEM;
+ hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!hugepage_kobj)) {
+ printk(KERN_ERR "hugepage: failed kobject create\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed to register hugepage group\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed to register hugepage group\n");
+ goto out;
+ }
+#endif
+
+ err = khugepaged_slab_init();
+ if (err)
+ goto out;
+
+ err = mm_slots_hash_init();
+ if (err) {
+ khugepaged_slab_free();
+ goto out;
+ }
+
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than the reduced
+ * TLB overhead is likely to help. The admin can still enable it
+ * through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
+ start_khugepaged();
+
+ set_recommended_min_free_kbytes();
+
+out:
+ return err;
+}
+module_init(hugepage_init)
+
+static int __init setup_transparent_hugepage(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ if (!strcmp(str, "always")) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING
+ "transparent_hugepage= cannot parse, ignored\n");
+ return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
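[Aside] Usage note (not part of the patch): the mode can be chosen at boot
with transparent_hugepage=always, transparent_hugepage=madvise or
transparent_hugepage=never, and changed at runtime by writing the same
keywords to /sys/kernel/mm/transparent_hugepage/enabled.
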
+static void prepare_pmd_huge_pte(pgtable_t pgtable,
+ struct mm_struct *mm)
+{
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ if (!mm->pmd_huge_pte)
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
+}
+
+static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pmd = pmd_mkwrite(pmd);
+ return pmd;
+}
+
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ struct page *page)
+{
+ int ret = 0;
+ pgtable_t pgtable;
+
+ VM_BUG_ON(!PageCompound(page));
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable)) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ return VM_FAULT_OOM;
+ }
+
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ __SetPageUptodate(page);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_none(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ pte_free(mm, pgtable);
+ } else {
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ /*
+ * The spinlocking to take the lru_lock inside
+ * page_add_new_anon_rmap() acts as a full memory
+ * barrier to be sure clear_huge_page writes become
+ * visible after the set_pmd_at() write.
+ */
+ page_add_new_anon_rmap(page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ prepare_pmd_huge_pte(pgtable, mm);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ spin_unlock(&mm->page_table_lock);
+ }
+
+ return ret;
+}
+
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+ return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+ HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
+
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ if (unlikely(!page))
+ goto out;
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+ }
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if a huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if a huge pmd materialized from under us, just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON(!PageHead(src_page));
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ prepare_pmd_huge_pte(pgtable, dst_mm);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+out:
+ return ret;
+}
+
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
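
[Aside] To make the pairing explicit, a two-line sketch (not part of the
patch) of how the deposited pte page flows; both helpers assert that
mm->page_table_lock is held, and the withdraw side is used by the zap and
split paths below:

	/* huge fault path: deposit a preallocated pte page */
	prepare_pmd_huge_pte(pgtable, mm);

	/* zap/split path: withdraw it again, in FIFO order */
	pgtable = get_pmd_huge_pte(mm);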
+
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ struct page *page,
+ unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int ret = 0, i;
+ struct page **pages;
+
+ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, address);
+ if (unlikely(!pages[i] ||
+ mem_cgroup_newpage_charge(pages[i], mm,
+ GFP_KERNEL))) {
+ if (pages[i])
+ put_page(pages[i]);
+ mem_cgroup_uncharge_start();
+ while (--i >= 0) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SHIFT*i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_pages;
+ VM_BUG_ON(!PageHead(page));
+
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(pages[i], vma, haddr);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ kfree(pages);
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ page_remove_rmap(page);
+ spin_unlock(&mm->page_table_lock);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ goto out;
+}
+
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ int ret = 0;
+ struct page *page, *new_page;
+ unsigned long haddr;
+
+ VM_BUG_ON(!vma->anon_vma);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ haddr = address & HPAGE_PMD_MASK;
+ if (page_mapcount(page) == 1) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache(vma, address, entry);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ put_page(page);
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ spin_lock(&mm->page_table_lock);
+ put_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ } else {
+ pmd_t entry;
+ VM_BUG_ON(!PageHead(page));
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
+ put_page(page);
+ ret |= VM_FAULT_WRITE;
+ }
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+out:
+ return ret;
+}
+
+struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page = NULL;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!PageHead(page));
+ if (flags & FOLL_TOUCH) {
+ pmd_t _pmd;
+ /*
+		 * We should set the dirty bit only for FOLL_WRITE, but
+		 * for now the dirty bit in the pmd is meaningless.
+		 * If the dirty bit ever becomes meaningful and we only
+		 * set it with FOLL_WRITE, an atomic set_bit will be
+		 * required on the pmd to set the young bit, instead of
+		 * the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ }
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON(!PageCompound(page));
+ if (flags & FOLL_GET)
+ get_page(page);
+
+out:
+ return page;
+}
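
The `page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT` step above just turns the low address bits into a tail-page index. A quick standalone check of that arithmetic (x86 2MB pmd and 4k pages assumed, address hypothetical):

#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_SIZE	(2UL << 20)	/* assumed: x86 2MB pmd */
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x7f0000654321UL;	/* hypothetical address */
	/* index of the 4k subpage inside the 2MB hugepage */
	unsigned long idx = (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;

	printf("subpage index %lu of %lu\n", idx, HPAGE_PMD_SIZE >> PAGE_SHIFT);
	return 0;
}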
+
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd)
+{
+ int ret = 0;
+
+ spin_lock(&tlb->mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&tlb->mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma,
+ pmd);
+ } else {
+ struct page *page;
+ pgtable_t pgtable;
+ pgtable = get_pmd_huge_pte(tlb->mm);
+ page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ pte_free(tlb->mm, pgtable);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&tlb->mm->page_table_lock);
+
+ return ret;
+}
+
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ int ret = 0;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ ret = !pmd_trans_splitting(*pmd);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ if (unlikely(!ret))
+ wait_split_huge_page(vma->anon_vma, pmd);
+ else {
+ /*
+ * All logical pages in the range are present
+ * if backed by a huge page.
+ */
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
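
The memset() above is what makes mincore(2) report every 4k subpage of a huge-pmd-backed range as resident in one shot. A minimal userspace sketch observing that; whether the range really gets THP backing depends on alignment and the enabled policy, so treat the output as best-effort:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;			/* one huge-pmd worth */
	long pagesz = sysconf(_SC_PAGESIZE);
	unsigned char *vec = malloc(len / pagesz);
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (!vec || p == MAP_FAILED)
		return 1;
	memset(p, 0, len);			/* fault the range in */
	if (mincore(p, len, vec))
		return 1;
	printf("first %d last %d\n", vec[0] & 1, vec[len / pagesz - 1] & 1);
	return 0;
}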
+
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ pmd_t entry;
+
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmd_modify(entry, newprot);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
+
+pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, *ret = NULL;
+
+ if (address & ~HPAGE_PMD_MASK)
+ goto out;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_page(*pmd) != page)
+ goto out;
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+	 * no risk as long as all huge pmds are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+ !pmd_trans_splitting(*pmd));
+ ret = pmd;
+ }
+out:
+ return ret;
+}
+
+static int __split_huge_page_splitting(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ if (pmd) {
+ /*
+ * We can't temporarily set the pmd to null in order
+		 * to split it; the pmd must remain marked huge at all
+		 * times, or the VM won't take the pmd_trans_huge paths
+		 * and won't wait on the anon_vma->root->lock to
+ * serialize against split_huge_page*.
+ */
+ pmdp_splitting_flush_notify(vma, address, pmd);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+static void __split_huge_page_refcount(struct page *page)
+{
+ int i;
+ unsigned long head_index = page->index;
+ struct zone *zone = page_zone(page);
+ int zonestat;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ compound_lock(page);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_count cannot change */
+ atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+ BUG_ON(page_count(page) <= 0);
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ BUG_ON(atomic_read(&page_tail->_count) <= 0);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ /*
+		 * Retain the hwpoison flag of the poisoned tail page:
+		 * this fixes memory-failure handling killing the wrong
+		 * process on a guest machine (KVM).
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * 1) clear PageTail before overwriting first_page
+ * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+ */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmd that could map this
+ * hugepage, that will ensure no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the below code. So
+		 * for this code to be safe, the mapcount can't change
+		 * during the split. But that doesn't mean userland can't
+ * keep changing and reading the page contents while
+ * we transfer the mapcount, so the pmd splitting
+ * status is achieved setting a reserved bit in the
+ * pmd, not by clearing the present bit.
+ */
+ BUG_ON(page_mapcount(page_tail));
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = ++head_index;
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ mem_cgroup_split_huge_fixup(page, page_tail);
+
+ lru_add_page_tail(zone, page, page_tail);
+ }
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ if (pmd) {
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR;
+ i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has the whole access to the hugepage
+ * during the split (which happens in place). If we
+ * overwrite the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+		 * in the huge TLB. Some CPUs don't like that. See
+		 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+		 * Erratum 383 on page 93. Intel should be safe but
+		 * also warns that it's only safe if the permission
+		 * and cache attributes of the two entries loaded in
+		 * the two TLBs are identical (which should be the case
+ * here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual
+ * address to be loaded simultaneously. So instead of
+ * doing "pmd_populate(); flush_tlb_range();" we first
+ * mark the current pmd notpresent (atomically because
+ * here the pmd_trans_huge and pmd_trans_splitting
+ * must remain set at all times on the pmd until the
+ * split is complete for this pmd), then we flush the
+ * SMP TLB and finally we write the non-huge version
+ * of the pmd entry with pmd_populate.
+ */
+ set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+/* must be called with anon_vma->root->lock held */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma)
+{
+ int mapcount, mapcount2;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+	 * anon_vma list. This guarantees that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
+
+ __split_huge_page_refcount(page);
+
+ mapcount2 = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
+}
+
+int split_huge_page(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ int ret = 1;
+
+ BUG_ON(!PageAnon(page));
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ goto out;
+ ret = 0;
+ if (!PageCompound(page))
+ goto out_unlock;
+
+ BUG_ON(!PageSwapBacked(page));
+ __split_huge_page(page, anon_vma);
+
+ BUG_ON(PageCompound(page));
+out_unlock:
+ page_unlock_anon_vma(anon_vma);
+out:
+ return ret;
+}
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE |
+ VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+ VM_MIXEDMAP | VM_SAO))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+		 * If the vma becomes good for khugepaged to scan,
+		 * register it here without waiting for a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE |
+ VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+ VM_MIXEDMAP | VM_SAO))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+		 * this vma even if the mm stays registered in khugepaged
+		 * (it may have been registered before VM_NOHUGEPAGE was set).
+ */
+ break;
+ }
+
+ return 0;
+}
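
The userland side of this hook is plain madvise(2). A minimal sketch follows; the MADV_HUGEPAGE fallback value is the one this series assigns, so treat it as an assumption if your installed headers already define it:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumed: value assigned by this series */
#endif

int main(void)
{
	size_t len = 4UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* lands in hugepage_madvise() above and sets VM_HUGEPAGE */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise");
	return 0;
}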
+
+static int __init khugepaged_slab_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+ mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!mm_slots_hash)
+ return -ENOMEM;
+ return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+ mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ hlist_for_each_entry(mm_slot, node, bucket, hash) {
+ if (mm == mm_slot->mm)
+ return mm_slot;
+ }
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ mm_slot->mm = mm;
+ hlist_add_head(&mm_slot->hash, bucket);
+}
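
get_mm_slot() and insert_to_mm_slots_hash() must derive the same bucket from an mm pointer. A standalone sketch of that bucket arithmetic; the struct size and the MM_SLOTS_HASH_HEADS value below are stand-ins, the real constant is defined earlier in this file:

#include <stdio.h>

#define MM_SLOTS_HASH_HEADS 1024		/* assumed stand-in value */
struct mm_struct { char pad[2048]; };		/* stand-in for the real struct */

static unsigned long mm_bucket(struct mm_struct *mm)
{
	/* dividing by the object size spreads nearby slab addresses */
	return ((unsigned long)mm / sizeof(struct mm_struct))
		% MM_SLOTS_HASH_HEADS;
}

int main(void)
{
	static struct mm_struct mms[4];
	int i;

	for (i = 0; i < 4; i++)
		printf("mm %d -> bucket %lu\n", i, mm_bucket(&mms[i]));
	return 0;
}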
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON(khugepaged_test_exit(mm));
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+ unsigned long hstart, hend;
+ if (!vma->anon_vma)
+ /*
+ * Not yet faulted in so we will register later in the
+ * page fault if needed.
+ */
+ return 0;
+ if (vma->vm_file || vma->vm_ops)
+ /* khugepaged not yet working on file or special mappings */
+ return 0;
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma);
+ return 0;
+}
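
The hstart/hend computation rounds the vma inward to huge-pmd boundaries: vm_start is rounded up, vm_end rounded down, and only a vma with hstart < hend can hold a whole hugepage. A quick standalone check (x86 2MB pmd assumed, addresses hypothetical):

#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)	/* assumed: x86 2MB pmd */
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x601234, vm_end = 0xa01000;
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend = vm_end & HPAGE_PMD_MASK;

	printf("hstart %#lx hend %#lx eligible %d\n",
	       hstart, hend, hstart < hend);
	return 0;
}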
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+
+ if (free) {
+ spin_unlock(&khugepaged_mm_lock);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ spin_unlock(&khugepaged_mm_lock);
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+		 * return, all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ } else
+ spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+ if (!pte_none(pteval))
+ release_pte_page(pte_page(pteval));
+ }
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+ release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, isolated = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ }
+ if (!pte_present(pteval) || !pte_write(pteval)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+		 * Isolate the page to avoid collapsing a hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+		/* If no mapped pte is young, don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (unlikely(!referenced))
+ release_all_pte_pages(pte);
+ else
+ isolated = 1;
+out:
+ return isolated;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
+ VM_BUG_ON(page_count(src_page) != 2);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+ VM_BUG_ON(!*hpage);
+ new_page = *hpage;
+#else
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+	 * later when we take the mmap_sem in write mode. This is
+	 * friendlier behavior (OTOH it may actually hide bugs) towards
+	 * userland filesystems with daemons allocating memory in
+	 * the userland I/O paths. Allocating memory with the
+	 * mmap_sem in read mode is also a good idea to allow greater
+	 * scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ if (unlikely(!new_page)) {
+ up_read(&mm->mmap_sem);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
+#endif
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ put_page(new_page);
+ return;
+ }
+
+ /* after allocating the hugepage upgrade to mmap_sem write mode */
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Prevent all access to pagetables with the exception of
+	 * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ vma = find_vma(mm, address);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ goto out;
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ goto out;
+ if (is_vma_temporary_stack(vma))
+ goto out;
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ /* pmd can't go away or become huge under us */
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ ptl = pte_lockptr(mm, pmd);
+
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+ spin_unlock(&mm->page_table_lock);
+
+ spin_lock(ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(ptl);
+
+ if (unlikely(!isolated)) {
+ pte_unmap(pte);
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd_at(mm, address, pmd, _pmd);
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock(vma->anon_vma);
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+ VM_BUG_ON(page_count(pgtable) != 1);
+ VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+ _pmd = mk_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ _pmd = pmd_mkhuge(_pmd);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+	 * this is needed to avoid the copy_huge_page writes becoming
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address);
+ set_pmd_at(mm, address, pmd, _pmd);
+	update_mmu_cache(vma, address, _pmd);
+ prepare_pmd_huge_pte(pgtable, mm);
+ mm->nr_ptes--;
+ spin_unlock(&mm->page_table_lock);
+
+#ifndef CONFIG_NUMA
+ *hpage = NULL;
+#endif
+ khugepaged_pages_collapsed++;
+out_up_write:
+ up_write(&mm->mmap_sem);
+ return;
+
+out:
+ mem_cgroup_uncharge_page(new_page);
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ goto out_up_write;
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, referenced = 0, none = 0;
+ struct page *page;
+ unsigned long _address;
+ spinlock_t *ptl;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out_unmap;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out_unmap;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ goto out_unmap;
+ VM_BUG_ON(PageCompound(page));
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out_unmap;
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (referenced)
+ ret = 1;
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma);
+out:
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ skip:
+ progress++;
+ continue;
+ }
+ /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ goto skip;
+ if (is_vma_temporary_stack(vma))
+ goto skip;
+
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ !khugepaged_enabled();
+}
+
+static void khugepaged_do_scan(struct page **hpage)
+{
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ cond_resched();
+
+#ifndef CONFIG_NUMA
+ if (!*hpage) {
+ *hpage = alloc_hugepage(khugepaged_defrag());
+ if (unlikely(!*hpage))
+ break;
+ }
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(void)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage)
+ khugepaged_alloc_sleep();
+ } while (unlikely(!hpage) &&
+ likely(khugepaged_enabled()));
+ return hpage;
+}
+#endif
+
+static void khugepaged_loop(void)
+{
+ struct page *hpage;
+
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
+ while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
+ hpage = khugepaged_alloc_hugepage();
+ if (unlikely(!hpage))
+ break;
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
+
+ khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
+ if (hpage)
+ put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (khugepaged_has_work()) {
+ DEFINE_WAIT(wait);
+ if (!khugepaged_scan_sleep_millisecs)
+ continue;
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_scan_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+ } else if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
+ }
+}
+
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, 19);
+
+ /* serialize with start_khugepaged() */
+ mutex_lock(&khugepaged_mutex);
+
+ for (;;) {
+ mutex_unlock(&khugepaged_mutex);
+ VM_BUG_ON(khugepaged_thread != current);
+ khugepaged_loop();
+ VM_BUG_ON(khugepaged_thread != current);
+
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_enabled())
+ break;
+ if (unlikely(kthread_should_stop()))
+ break;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+
+ khugepaged_thread = NULL;
+ mutex_unlock(&khugepaged_mutex);
+
+ return 0;
+}
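
The scan batch size and both sleep intervals consumed by the loops above are exported as sysfs tunables elsewhere in this series. A hedged userspace sketch for inspecting them; the paths are assumed from the sysfs layout this series introduces, so they only exist on a kernel carrying it:

#include <stdio.h>

int main(void)
{
	static const char *knobs[] = {
		"/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan",
		"/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs",
		"/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs",
	};
	char buf[64];
	unsigned int i;

	for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		FILE *f = fopen(knobs[i], "r");

		if (!f)
			continue;	/* kernel without this series */
		if (fgets(buf, sizeof(buf), f))
			printf("%s = %s", knobs[i], buf);
		fclose(f);
	}
	return 0;
}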
+
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!page_count(page));
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ split_huge_page(page);
+
+ put_page(page);
+ BUG_ON(pmd_trans_huge(*pmd));
+}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+	 * previously contain a hugepage: check if we need to split
+	 * a huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+	 * previously contain a hugepage: check if we need to split
+	 * a huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+	 * If we're also updating vma->vm_next->vm_start, and the new
+	 * vm_next->vm_start isn't page aligned and it could previously
+	 * contain a hugepage: check if we need to split a huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
return 0;
}
-static void clear_gigantic_page(struct page *page,
- unsigned long addr, unsigned long sz)
+static void copy_gigantic_page(struct page *dst, struct page *src)
{
int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-static void clear_huge_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
-
- if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, sz);
- return;
- }
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++) {
- cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
- }
-}
-
-static void copy_gigantic_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
-{
- int i;
- struct hstate *h = hstate_vma(vma);
+ struct hstate *h = page_hstate(src);
struct page *dst_base = dst;
struct page *src_base = src;
- might_sleep();
+
for (i = 0; i < pages_per_huge_page(h); ) {
cond_resched();
- copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+ copy_highpage(dst, src);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
-static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
+
+void copy_huge_page(struct page *dst, struct page *src)
{
int i;
- struct hstate *h = hstate_vma(vma);
+ struct hstate *h = page_hstate(src);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_gigantic_page(dst, src, addr, vma);
+ copy_gigantic_page(dst, src);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ copy_highpage(dst + i, src + i);
}
}
@@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_del(&page->lru);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
@@ -496,19 +479,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
-
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
-
- break;
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
}
}
err:
@@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
- unsigned int nid;
+ unsigned int r_nid;
if (h->order >= MAX_ORDER)
return NULL;
@@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
@@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
spin_lock(&hugetlb_lock);
if (page) {
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON(page_count(page));
- nid = page_to_nid(page);
+ r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
* We incremented the global counters already
*/
- h->nr_huge_pages_node[nid]++;
- h->surplus_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
h->nr_huge_pages--;
@@ -848,6 +823,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares about the
+ * physical address of the error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_node(h, nid);
+ spin_unlock(&hugetlb_lock);
+
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ return page;
+}
+
+/*
 * Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
@@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
- if (!page) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
- spin_lock(&hugetlb_lock);
- needed = 0;
goto free;
- }
list_add(&page->lru, &surplus_list);
}
@@ -908,31 +899,31 @@ retry:
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
-free:
+
+ spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
list_del(&page->lru);
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
/* Free unnecessary surplus pages to the buddy allocator */
+free:
if (!list_empty(&surplus_list)) {
- spin_unlock(&hugetlb_lock);
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
- /*
- * The page has a reference count of zero already, so
- * call free_huge_page directly instead of using
- * put_page. This must be done with hugetlb_lock
- * unlocked which is safe because free_huge_page takes
- * hugetlb_lock before deciding how to free the page.
- */
- free_huge_page(page);
+ put_page(page);
}
- spin_lock(&hugetlb_lock);
}
+ spin_lock(&hugetlb_lock);
return ret;
}
@@ -1052,14 +1043,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_refcounted(page);
set_page_private(page, (unsigned long) mapping);
vma_commit_reservation(h, vma, addr);
@@ -1373,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
return sprintf(buf, "%lu\n", nr_huge_pages);
}
+
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
@@ -1385,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
err = strict_strtoul(buf, 10, &count);
if (err)
- return 0;
+ goto out;
h = kobj_to_hstate(kobj, &nid);
+ if (h->order >= MAX_ORDER) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
@@ -1413,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
NODEMASK_FREE(nodes_allowed);
return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1455,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
+
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -1462,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
+ if (h->order >= MAX_ORDER)
+ return -EINVAL;
+
err = strict_strtoul(buf, 10, &input);
if (err)
- return 0;
+ return err;
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
@@ -1867,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->max_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1888,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
-
- return 0;
+out:
+ return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1927,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->nr_overcommit_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock(&hugetlb_lock);
}
-
- return 0;
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
@@ -2153,6 +2168,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -2380,10 +2408,14 @@ retry_avoidcopy:
 * When the original hugepage is a shared one, it does not have
* anon_vma prepared.
*/
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
+ }
- copy_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
__SetPageUptodate(new_page);
/*
@@ -2487,7 +2519,7 @@ retry:
ret = -PTR_ERR(page);
goto out;
}
- clear_huge_page(page, address, huge_page_size(h));
+ clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
if (vma->vm_flags & VM_MAYSHARE) {
@@ -2515,22 +2547,20 @@ retry:
hugepage_add_new_anon_rmap(page, vma, address);
}
} else {
+ /*
+		 * If a memory error occurs between mmap() and fault, some
+		 * processes don't have a hwpoisoned swap entry for the
+		 * errored virtual address. So we need to block the hugepage
+		 * fault with a PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(h - hstates);
+ goto backout_unlocked;
+ }
page_dup_rmap(page);
}
/*
- * Since memory error handler replaces pte into hwpoison swap entry
- * at the time of error handling, a process which reserved but not have
- * the mapping to the error hugepage does not have hwpoison swap entry.
- * So we need to block accesses from such a process by checking
- * PG_hwpoison bit here.
- */
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON;
- goto backout_unlocked;
- }
-
- /*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
@@ -2587,8 +2617,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON;
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(h - hstates);
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2665,7 +2699,8 @@ out_page_table_lock:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
- unlock_page(page);
+ if (page != pagecache_page)
+ unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2878,18 +2913,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called with hugetlb_lock held */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+ if (page == hpage)
+ return 1;
+ return 0;
+}
+
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
*/
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
{
struct hstate *h = page_hstate(hpage);
int nid = page_to_nid(hpage);
+ int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- list_del(&hpage->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+ if (is_hugepage_on_freelist(hpage)) {
+ list_del(&hpage->lru);
+ set_page_refcounted(hpage);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ ret = 0;
+ }
spin_unlock(&hugetlb_lock);
+ return ret;
}
+#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..69488205723d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
*/
static inline unsigned long page_order(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ /* PageBuddy() must be checked by the caller */
return page_private(page);
}
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -243,7 +247,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas);
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking);
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
* after the module is removed.
*/
for (i = 0; i < 10; i++) {
- elem = kmalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+ pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
if (!elem)
return -ENOMEM;
- memset(elem, 0, sizeof(*elem));
INIT_LIST_HEAD(&elem->list);
-
list_add_tail(&elem->list, &test_list);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+ __GFP_NORETRY | __GFP_NOMEMALLOC | \
+ __GFP_NOWARN)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct kmemleak_object *object;
struct prio_tree_node *node;
- object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+ pr_warning("Cannot allocate a kmemleak_object structure\n");
+ kmemleak_disable();
return NULL;
}
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- kmemleak_warn("Cannot allocate a scan area\n");
+ pr_warning("Cannot allocate a scan area\n");
goto out;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d9..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hash.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -411,6 +412,20 @@ out:
up_read(&mm->mmap_sem);
}
+static struct page *page_trans_compound_anon(struct page *page)
+{
+ if (PageTransCompound(page)) {
+ struct page *head = compound_trans_head(page);
+ /*
+		 * head may actually be split and freed from under
+		 * us, but that's ok here.
+ */
+ if (PageAnon(head))
+ return head;
+ }
+ return NULL;
+}
+
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page)) {
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
+ BUG_ON(PageTransCompound(page));
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
goto out;
pmd = pmd_offset(pud, addr);
+ BUG_ON(pmd_trans_huge(*pmd));
if (!pmd_present(*pmd))
goto out;
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
return err;
}
+static int page_trans_compound_anon_split(struct page *page)
+{
+ int ret = 0;
+ struct page *transhuge_head = page_trans_compound_anon(page);
+ if (transhuge_head) {
+ /* Get the reference on the head to split it. */
+ if (get_page_unless_zero(transhuge_head)) {
+ /*
+ * Recheck we got the reference while the head
+ * was still anonymous.
+ */
+ if (PageAnon(transhuge_head))
+ ret = split_huge_page(transhuge_head);
+ else
+ /*
+				 * Retry later if split_huge_page ran
+ * from under us.
+ */
+ ret = 1;
+ put_page(transhuge_head);
+ } else
+ /* Retry later if split_huge_page ran from under us. */
+ ret = 1;
+ }
+ return ret;
+}
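
page_trans_compound_anon_split() above is an instance of the speculative get_page_unless_zero() pattern: pin the head page only if its refcount is still non-zero, re-validate what was observed before the pin (PageAnon), act only if the check still holds, and otherwise ask the caller to retry. A hedged userspace sketch of that validate-after-pin shape, with invented names (struct object, pin(), act_on()):

#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_int refcount;	/* 0 means the object is being torn down */
	atomic_bool valid;	/* the property observed before pinning   */
};

/* Speculative pin: succeeds only if the refcount was non-zero,
 * mirroring get_page_unless_zero(). */
static bool pin(struct object *o)
{
	int r = atomic_load(&o->refcount);

	while (r > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &r, r + 1))
			return true;
	return false;
}

static void unpin(struct object *o)
{
	atomic_fetch_sub(&o->refcount, 1);
}

/* Returns 0 on success, 1 if the caller should retry later. */
static int act_on(struct object *o)
{
	int ret = 1;

	if (!pin(o))
		return 1;		/* object vanished under us */
	if (atomic_load(&o->valid))
		ret = 0;		/* recheck held: safe to operate */
	unpin(o);
	return ret;
}

int main(void)
{
	struct object o = { 1, true };
	return act_on(&o);		/* 0: pin and recheck succeeded */
}
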
+
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
root_unstable_tree = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
up_read(&mm->mmap_sem);
return rmap_item;
}
- if (!IS_ERR_OR_NULL(*page))
- put_page(*page);
+ put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
- while (scan_npages--) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
+ try_to_freeze();
+
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
- wait_event_interruptible(ksm_thread_wait,
+ wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
@@ -1724,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self,
/*
* Keep it very simple for now: just lock out ksmd and
* MADV_UNMERGEABLE while any memory is going offline.
+ * mutex_lock_nested() is necessary because lockdep was alarmed
+ * that here we take ksm_thread_mutex inside notifier chain
+ * mutex, and later take notifier chain mutex inside
+ * ksm_thread_mutex to unlock it. But that's safe because both
+ * are inside mem_hotplug_mutex.
*/
- mutex_lock(&ksm_thread_mutex);
+ mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
break;
case MEM_OFFLINE:
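
The ksm.c hunks above also make ksmd freezable for suspend/hibernate: set_freezable() opts the kthread in, freezing(current) is polled inside ksm_do_scan() so a long scan cannot stall the freezer, try_to_freeze() parks the thread at a safe point, and wait_event_freezable() replaces wait_event_interruptible() so the thread can freeze while idle too. A minimal kernel-context sketch of the resulting loop shape; do_some_scanning() and scan_enabled are hypothetical, and the thread is assumed to be created elsewhere with kthread_run():

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(scan_wait);
static bool scan_enabled;		/* toggled elsewhere, e.g. via sysfs */

static void do_some_scanning(void);	/* hypothetical; should poll freezing(current) */

static int scan_thread(void *unused)
{
	set_freezable();			/* opt in to the freezer */

	while (!kthread_should_stop()) {
		if (scan_enabled)
			do_some_scanning();

		try_to_freeze();		/* safe point for suspend */

		if (scan_enabled)
			schedule_timeout_interruptible(msecs_to_jiffies(20));
		else
			/* idle, but still freezable while waiting */
			wait_event_freezable(scan_wait,
				scan_enabled || kthread_should_stop());
	}
	return 0;
}
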
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
/*
* Access kernel memory without faulting.
*/
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/uaccess.h>
/**
* probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
if (error)
goto out;
break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
return 1;
default:
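
With the hooks above in place, madvise(MADV_HUGEPAGE) and madvise(MADV_NOHUGEPAGE) let userspace opt a mapping in or out of transparent hugepages per VMA. A small userspace example; it assumes a kernel built with CONFIG_TRANSPARENT_HUGEPAGE (the call fails with EINVAL otherwise), and the 2 MB figure reflects the common x86 huge page size rather than anything this patch guarantees:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* kernel ABI value, if libc headers lack it */
#endif

#define SZ (16UL << 20)		/* 16 MB region */

int main(void)
{
	void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Per-VMA opt-in; huge pages can only back the parts of the
	 * region that happen to be 2 MB-aligned. */
	if (madvise(p, SZ, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	memset(p, 0, SZ);	/* touching it lets faults use huge pages */
	munmap(p, SZ);
	return 0;
}
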
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ecb..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,421 @@
*/
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/memblock.h>
-#define MEMBLOCK_ALLOC_ANYWHERE 0
+struct memblock memblock __initdata_memblock;
-struct memblock memblock;
+int memblock_debug __initdata_memblock;
+int memblock_can_resize __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
-static int memblock_debug;
+/* inline so we don't get a warning when pr_debug is compiled out */
+static inline const char *memblock_type_name(struct memblock_type *type)
+{
+ if (type == &memblock.memory)
+ return "memory";
+ else if (type == &memblock.reserved)
+ return "reserved";
+ else
+ return "unknown";
+}
-static int __init early_memblock(char *p)
+/*
+ * Address comparison utilities
+ */
+
+static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
{
- if (p && strstr(p, "debug"))
- memblock_debug = 1;
+ return addr & ~(size - 1);
+}
+
+static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
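
Both helpers above assume the alignment is a power of two, so that size - 1 is a mask of the low bits: align-down simply clears those bits, and align-up biases the address by size - 1 first. A standalone illustration:

#include <assert.h>
#include <stdint.h>

static uint64_t align_down(uint64_t addr, uint64_t align)
{
	return addr & ~(align - 1);	/* align must be a power of two */
}

static uint64_t align_up(uint64_t addr, uint64_t align)
{
	return (addr + align - 1) & ~(align - 1);
}

int main(void)
{
	assert(align_down(0x12345, 0x1000) == 0x12000);
	assert(align_up(0x12345, 0x1000) == 0x13000);
	assert(align_up(0x12000, 0x1000) == 0x12000);	/* already aligned */
	return 0;
}
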
+
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
return 0;
}
-early_param("memblock", early_memblock);
-static void memblock_dump(struct memblock_region *region, char *name)
+static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
+ unsigned long r1, unsigned long r2)
{
- unsigned long long base, size;
- int i;
+ phys_addr_t base1 = type->regions[r1].base;
+ phys_addr_t size1 = type->regions[r1].size;
+ phys_addr_t base2 = type->regions[r2].base;
+ phys_addr_t size2 = type->regions[r2].size;
- pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
- for (i = 0; i < region->cnt; i++) {
- base = region->region[i].base;
- size = region->region[i].size;
+long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i;
- pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n",
- name, i, base, base + size - 1, size);
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
}
+
+ return (i < type->cnt) ? i : -1;
}
-void memblock_dump_all(void)
+/*
+ * Find, allocate, deallocate or reserve unreserved regions. All allocations
+ * are top-down.
+ */
+
+static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align)
{
- if (!memblock_debug)
- return;
+ phys_addr_t base, res_base;
+ long j;
- pr_info("MEMBLOCK configuration:\n");
- pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size);
- pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size);
+ /* In case a huge size is requested */
+ if (end < size)
+ return MEMBLOCK_ERROR;
- memblock_dump(&memblock.memory, "memory");
- memblock_dump(&memblock.reserved, "reserved");
+ base = memblock_align_down((end - size), align);
+
+ /* Prevent allocations returning 0 as it's also used to
+ * indicate an allocation failure
+ */
+ if (start == 0)
+ start = PAGE_SIZE;
+
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0)
+ return base;
+ res_base = memblock.reserved.regions[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return MEMBLOCK_ERROR;
}
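
memblock_find_region() above is a top-down first-fit: start at end - size, and whenever the candidate block overlaps a reserved region, jump to just below that region's base, realign, and try again. A simplified userspace model of the same walk, using a small hypothetical reserved list instead of memblock's dynamically sized array:

#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };

/* Hypothetical reserved ranges, sorted by base. */
static const struct region reserved[] = {
	{ 0x2000, 0x1000 },
	{ 0x6000, 0x2000 },
};
enum { NRES = sizeof(reserved) / sizeof(reserved[0]) };

static uint64_t align_down(uint64_t a, uint64_t al)
{
	return a & ~(al - 1);
}

static int overlaps(uint64_t b, uint64_t s, const struct region *r)
{
	return b < r->base + r->size && r->base < b + s;
}

/* Highest aligned base in [start, end) that avoids every reserve,
 * or 0 on failure (0 doubles as the error value, as in memblock). */
static uint64_t find_top_down(uint64_t start, uint64_t end,
			      uint64_t size, uint64_t align)
{
	uint64_t base = align_down(end - size, align);

	while (base && base >= start) {
		int i, hit = -1;

		for (i = 0; i < NRES; i++)
			if (overlaps(base, size, &reserved[i]))
				hit = i;
		if (hit < 0)
			return base;			/* free slot */
		if (reserved[hit].base < size)
			break;				/* can't go lower */
		base = align_down(reserved[hit].base - size, align);
	}
	return 0;
}

int main(void)
{
	/* 0x7000 collides with the 0x6000 reserve, so the search drops
	 * below it and settles on 0x5000. */
	printf("0x%llx\n", (unsigned long long)
	       find_top_down(0, 0x8000, 0x1000, 0x1000));
	return 0;
}
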
-static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2,
- u64 size2)
+static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start, phys_addr_t end)
{
- return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+ long i;
+
+ BUG_ON(0 == size);
+
+ /* Pump up max_addr */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
+
+ /* We do a top-down search, this tends to limit memory
+ * fragmentation by keeping early boot allocs near the
+ * top of memory
+ */
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ phys_addr_t memblockbase = memblock.memory.regions[i].base;
+ phys_addr_t memblocksize = memblock.memory.regions[i].size;
+ phys_addr_t bottom, top, found;
+
+ if (memblocksize < size)
+ continue;
+ if ((memblockbase + memblocksize) <= start)
+ break;
+ bottom = max(memblockbase, start);
+ top = min(memblockbase + memblocksize, end);
+ if (bottom >= top)
+ continue;
+ found = memblock_find_region(bottom, top, size, align);
+ if (found != MEMBLOCK_ERROR)
+ return found;
+ }
+ return MEMBLOCK_ERROR;
}
-static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2)
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
+ return memblock_find_base(size, align, start, end);
+}
- return 0;
+/*
+ * Free memblock.reserved.regions
+ */
+int __init_memblock memblock_free_reserved_regions(void)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ return memblock_free(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static long memblock_regions_adjacent(struct memblock_region *rgn,
- unsigned long r1, unsigned long r2)
+/*
+ * Reserve memblock.reserved.regions
+ */
+int __init_memblock memblock_reserve_reserved_regions(void)
{
- u64 base1 = rgn->region[r1].base;
- u64 size1 = rgn->region[r1].size;
- u64 base2 = rgn->region[r2].base;
- u64 size2 = rgn->region[r2].size;
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
- return memblock_addrs_adjacent(base1, size1, base2, size2);
+ return memblock_reserve(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static void memblock_remove_region(struct memblock_region *rgn, unsigned long r)
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
unsigned long i;
- for (i = r; i < rgn->cnt - 1; i++) {
- rgn->region[i].base = rgn->region[i + 1].base;
- rgn->region[i].size = rgn->region[i + 1].size;
+ for (i = r; i < type->cnt - 1; i++) {
+ type->regions[i].base = type->regions[i + 1].base;
+ type->regions[i].size = type->regions[i + 1].size;
}
- rgn->cnt--;
+ type->cnt--;
}
/* Assumption: base addr of region 1 < base addr of region 2 */
-static void memblock_coalesce_regions(struct memblock_region *rgn,
+static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
unsigned long r1, unsigned long r2)
{
- rgn->region[r1].size += rgn->region[r2].size;
- memblock_remove_region(rgn, r2);
+ type->regions[r1].size += type->regions[r2].size;
+ memblock_remove_region(type, r2);
}
-void __init memblock_init(void)
+/* Defined below but needed now */
+static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
+
+static int __init_memblock memblock_double_array(struct memblock_type *type)
{
- /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
- * This simplifies the memblock_add() code below...
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_size, new_size, addr;
+ int use_slab = slab_is_available();
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
*/
- memblock.memory.region[0].base = 0;
- memblock.memory.region[0].size = 0;
- memblock.memory.cnt = 1;
+ if (!memblock_can_resize)
+ return -1;
- /* Ditto. */
- memblock.reserved.region[0].base = 0;
- memblock.reserved.region[0].size = 0;
- memblock.reserved.cnt = 1;
-}
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+
+ /* Try to find some space for it.
+ *
+ * WARNING: We assume that either slab_is_available() is true and we use
+ * slab, or we use MEMBLOCK for allocations. That means this is unsafe to use
+ * when bootmem is currently active (unless bootmem itself is implemented
+ * on top of MEMBLOCK which isn't the case yet)
+ *
+ * This should however not be an issue for now, as we currently only
+ * call into MEMBLOCK while it's still active, or much later when slab is
+ * active for memory hotplug operations
+ */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+ } else
+ addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr == MEMBLOCK_ERROR) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ memblock_type_name(type), type->max, type->max * 2);
+ return -1;
+ }
+ new_array = __va(addr);
-void __init memblock_analyze(void)
-{
- int i;
+ memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
- memblock.memory.size = 0;
+ /* Found space, we now need to move the array over before
+ * we add the reserved region since it may be our reserved
+ * array itself that is full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size);
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* If we use SLAB that's it, we are done */
+ if (use_slab)
+ return 0;
- for (i = 0; i < memblock.memory.cnt; i++)
- memblock.memory.size += memblock.memory.region[i].size;
+ /* Add the new reserved region now. Should not fail ! */
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+
+ /* If the array wasn't our static init one, then free it. We only do
+ * that before SLAB is available as later on, we don't know whether
+ * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
+ * anyways
+ */
+ if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_size);
+
+ return 0;
}
-static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size)
+extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+ phys_addr_t addr2, phys_addr_t size2)
+{
+ return 1;
+}
+
+static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
unsigned long coalesced = 0;
long adjacent, i;
- if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
return 0;
}
/* First try and coalesce this MEMBLOCK with another. */
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
if ((rgnbase == base) && (rgnsize == size))
/* Already have this region, so we're done */
return 0;
adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ /* Check if arch allows coalescing */
+ if (adjacent != 0 && type == &memblock.memory &&
+ !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
+ break;
if (adjacent > 0) {
- rgn->region[i].base -= size;
- rgn->region[i].size += size;
+ type->regions[i].base -= size;
+ type->regions[i].size += size;
coalesced++;
break;
} else if (adjacent < 0) {
- rgn->region[i].size += size;
+ type->regions[i].size += size;
coalesced++;
break;
}
}
- if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) {
- memblock_coalesce_regions(rgn, i, i+1);
+ /* If we plugged a hole, we may want to also coalesce with the
+ * next region
+ */
+ if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
+ ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
+ type->regions[i].size,
+ type->regions[i+1].base,
+ type->regions[i+1].size)))) {
+ memblock_coalesce_regions(type, i, i+1);
coalesced++;
}
if (coalesced)
return coalesced;
- if (rgn->cnt >= MAX_MEMBLOCK_REGIONS)
+
+ /* If we are out of space, we fail. It's too late to resize the array
+ * but then this shouldn't have happened in the first place.
+ */
+ if (WARN_ON(type->cnt >= type->max))
return -1;
/* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
- for (i = rgn->cnt - 1; i >= 0; i--) {
- if (base < rgn->region[i].base) {
- rgn->region[i+1].base = rgn->region[i].base;
- rgn->region[i+1].size = rgn->region[i].size;
+ for (i = type->cnt - 1; i >= 0; i--) {
+ if (base < type->regions[i].base) {
+ type->regions[i+1].base = type->regions[i].base;
+ type->regions[i+1].size = type->regions[i].size;
} else {
- rgn->region[i+1].base = base;
- rgn->region[i+1].size = size;
+ type->regions[i+1].base = base;
+ type->regions[i+1].size = size;
break;
}
}
- if (base < rgn->region[0].base) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if (base < type->regions[0].base) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ }
+ type->cnt++;
+
+ /* The array is full ? Try to resize it. If that fails, we undo
+ * our allocation and return an error
+ */
+ if (type->cnt == type->max && memblock_double_array(type)) {
+ type->cnt--;
+ return -1;
}
- rgn->cnt++;
return 0;
}
-long memblock_add(u64 base, u64 size)
+long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.memory;
-
- /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
- if (base == 0)
- memblock.rmo_size = size;
-
- return memblock_add_region(_rgn, base, size);
+ return memblock_add_region(&memblock.memory, base, size);
}
-static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
+static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
- u64 rgnbegin, rgnend;
- u64 end = base + size;
+ phys_addr_t rgnbegin, rgnend;
+ phys_addr_t end = base + size;
int i;
rgnbegin = rgnend = 0; /* suppress gcc warnings */
/* Find the region where (base, size) belongs to */
- for (i=0; i < rgn->cnt; i++) {
- rgnbegin = rgn->region[i].base;
- rgnend = rgnbegin + rgn->region[i].size;
+ for (i=0; i < type->cnt; i++) {
+ rgnbegin = type->regions[i].base;
+ rgnend = rgnbegin + type->regions[i].size;
if ((rgnbegin <= base) && (end <= rgnend))
break;
}
/* Didn't find the region */
- if (i == rgn->cnt)
+ if (i == type->cnt)
return -1;
/* Check to see if we are removing entire region */
if ((rgnbegin == base) && (rgnend == end)) {
- memblock_remove_region(rgn, i);
+ memblock_remove_region(type, i);
return 0;
}
/* Check to see if region is matching at the front */
if (rgnbegin == base) {
- rgn->region[i].base = end;
- rgn->region[i].size -= size;
+ type->regions[i].base = end;
+ type->regions[i].size -= size;
return 0;
}
/* Check to see if the region is matching at the end */
if (rgnend == end) {
- rgn->region[i].size -= size;
+ type->regions[i].size -= size;
return 0;
}
@@ -249,208 +433,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
* We need to split the entry - adjust the current one to the
* beginning of the hole and add the region after the hole.
*/
- rgn->region[i].size = base - rgn->region[i].base;
- return memblock_add_region(rgn, end, rgnend - end);
+ type->regions[i].size = base - type->regions[i].base;
+ return memblock_add_region(type, end, rgnend - end);
}
-long memblock_remove(u64 base, u64 size)
+long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.memory, base, size);
}
-long __init memblock_free(u64 base, u64 size)
+long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.reserved, base, size);
}
-long __init memblock_reserve(u64 base, u64 size)
+long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.reserved;
+ struct memblock_type *_rgn = &memblock.reserved;
BUG_ON(0 == size);
return memblock_add_region(_rgn, base, size);
}
-long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size)
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- unsigned long i;
+ phys_addr_t found;
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
- if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
- break;
- }
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
+ size = memblock_align_up(size, align);
+
+ found = memblock_find_base(size, align, 0, max_addr);
+ if (found != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, found, size) >= 0)
+ return found;
- return (i < rgn->cnt) ? i : -1;
+ return 0;
}
-static u64 memblock_align_down(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return addr & ~(size - 1);
+ phys_addr_t alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
}
-static u64 memblock_align_up(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
{
- return (addr + (size - 1)) & ~(size - 1);
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
-static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end,
- u64 size, u64 align)
+
+/*
+ * Additional node-local allocators. Search for node memory is bottom up
+ * and walks memblock regions within that node bottom-up as well, but allocation
+ * within a memblock region is top-down. XXX I plan to fix that at some stage
+ *
+ * WARNING: Only available after early_node_map[] has been populated;
+ * on some architectures that is after all the calls to add_active_range()
+ * have been made to populate it.
+ */
+
+phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
{
- u64 base, res_base;
- long j;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * This code originates from sparc which really wants us to walk by addresses
+ * and returns the nid. This is not very convenient for early_pfn_map[] users
+ * as the map isn't sorted yet, and it really wants to be walked by nid.
+ *
+ * For now, I implement the inefficient method below which walks the early
+ * map multiple times. Eventually we may want to use an ARCH config option
+ * to implement a completely different method for both cases.
+ */
+ unsigned long start_pfn, end_pfn;
+ int i;
- base = memblock_align_down((end - size), align);
- while (start <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- base = ~(u64)0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
+ if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+ continue;
+ *nid = i;
+ return min(end, PFN_PHYS(end_pfn));
}
+#endif
+ *nid = 0;
- return ~(u64)0;
+ return end;
}
-static u64 __init memblock_alloc_nid_region(struct memblock_property *mp,
- u64 (*nid_range)(u64, u64, int *),
- u64 size, u64 align, int nid)
+static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
+ phys_addr_t size,
+ phys_addr_t align, int nid)
{
- u64 start, end;
+ phys_addr_t start, end;
start = mp->base;
end = start + mp->size;
start = memblock_align_up(start, align);
while (start < end) {
- u64 this_end;
+ phys_addr_t this_end;
int this_nid;
- this_end = nid_range(start, end, &this_nid);
+ this_end = memblock_nid_range(start, end, &this_nid);
if (this_nid == nid) {
- u64 ret = memblock_alloc_nid_unreserved(start, this_end,
- size, align);
- if (ret != ~(u64)0)
+ phys_addr_t ret = memblock_find_region(start, this_end, size, align);
+ if (ret != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, ret, size) >= 0)
return ret;
}
start = this_end;
}
- return ~(u64)0;
+ return MEMBLOCK_ERROR;
}
-u64 __init memblock_alloc_nid(u64 size, u64 align, int nid,
- u64 (*nid_range)(u64 start, u64 end, int *nid))
+phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- struct memblock_region *mem = &memblock.memory;
+ struct memblock_type *mem = &memblock.memory;
int i;
BUG_ON(0 == size);
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
size = memblock_align_up(size, align);
+ /* We do a bottom-up search for a region with the right
+ * nid since that's easier considering how memblock_nid_range()
+ * works
+ */
for (i = 0; i < mem->cnt; i++) {
- u64 ret = memblock_alloc_nid_region(&mem->region[i],
- nid_range,
+ phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
size, align, nid);
- if (ret != ~(u64)0)
+ if (ret != MEMBLOCK_ERROR)
return ret;
}
- return memblock_alloc(size, align);
-}
-
-u64 __init memblock_alloc(u64 size, u64 align)
-{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+ return 0;
}
-u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- u64 alloc;
-
- alloc = __memblock_alloc_base(size, align, max_addr);
+ phys_addr_t res = memblock_alloc_nid(size, align, nid);
- if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
-
- return alloc;
+ if (res)
+ return res;
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
}
-u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
-{
- long i, j;
- u64 base = 0;
- u64 res_base;
-
- BUG_ON(0 == size);
-
- size = memblock_align_up(size, align);
-
- /* On some platforms, make sure we allocate lowmem */
- /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- max_addr = MEMBLOCK_REAL_LIMIT;
-
- for (i = memblock.memory.cnt - 1; i >= 0; i--) {
- u64 memblockbase = memblock.memory.region[i].base;
- u64 memblocksize = memblock.memory.region[i].size;
-
- if (memblocksize < size)
- continue;
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- base = memblock_align_down(memblockbase + memblocksize - size, align);
- else if (memblockbase < max_addr) {
- base = min(memblockbase + memblocksize, max_addr);
- base = memblock_align_down(base - size, align);
- } else
- continue;
- while (base && memblockbase <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- return 0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
- }
- }
- return 0;
-}
+/*
+ * Remaining API functions
+ */
/* You must call memblock_analyze() before this. */
-u64 __init memblock_phys_mem_size(void)
+phys_addr_t __init memblock_phys_mem_size(void)
{
- return memblock.memory.size;
+ return memblock.memory_size;
}
-u64 memblock_end_of_DRAM(void)
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
int idx = memblock.memory.cnt - 1;
- return (memblock.memory.region[idx].base + memblock.memory.region[idx].size);
+ return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
}
/* You must call memblock_analyze() after this. */
-void __init memblock_enforce_memory_limit(u64 memory_limit)
+void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
{
unsigned long i;
- u64 limit;
- struct memblock_property *p;
+ phys_addr_t limit;
+ struct memblock_region *p;
if (!memory_limit)
return;
@@ -458,24 +623,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
/* Truncate the memblock regions to satisfy the memory limit. */
limit = memory_limit;
for (i = 0; i < memblock.memory.cnt; i++) {
- if (limit > memblock.memory.region[i].size) {
- limit -= memblock.memory.region[i].size;
+ if (limit > memblock.memory.regions[i].size) {
+ limit -= memblock.memory.regions[i].size;
continue;
}
- memblock.memory.region[i].size = limit;
+ memblock.memory.regions[i].size = limit;
memblock.memory.cnt = i + 1;
break;
}
- if (memblock.memory.region[0].size < memblock.rmo_size)
- memblock.rmo_size = memblock.memory.region[0].size;
-
memory_limit = memblock_end_of_DRAM();
/* And truncate any reserves above the limit also. */
for (i = 0; i < memblock.reserved.cnt; i++) {
- p = &memblock.reserved.region[i];
+ p = &memblock.reserved.regions[i];
if (p->base > memory_limit)
p->size = 0;
@@ -489,53 +651,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
}
}
-int __init memblock_is_reserved(u64 addr)
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ unsigned int left = 0, right = type->cnt;
+
+ do {
+ unsigned int mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else
+ return mid;
+ } while (left < right);
+ return -1;
+}
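
memblock_search() above replaces the old linear scan in memblock_is_reserved() with a binary search, which is valid because memblock_add_region() keeps both region arrays sorted by base and non-overlapping. The same routine in self-contained form, with a small test:

#include <assert.h>
#include <stdint.h>

struct region { uint64_t base, size; };

/* Regions must be sorted by base and non-overlapping. */
static int search(const struct region *r, unsigned int n, uint64_t addr)
{
	unsigned int left = 0, right = n;

	while (left < right) {
		unsigned int mid = left + (right - left) / 2;

		if (addr < r[mid].base)
			right = mid;
		else if (addr >= r[mid].base + r[mid].size)
			left = mid + 1;
		else
			return (int)mid;	/* addr inside r[mid] */
	}
	return -1;
}

int main(void)
{
	const struct region r[] = { { 0x1000, 0x1000 }, { 0x4000, 0x2000 } };

	assert(search(r, 2, 0x1800) == 0);
	assert(search(r, 2, 0x4000) == 1);
	assert(search(r, 2, 0x3000) == -1);	/* in the hole */
	return 0;
}
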
+
+int __init memblock_is_reserved(phys_addr_t addr)
+{
+ return memblock_search(&memblock.reserved, addr) != -1;
+}
+
+int __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+ return memblock_search(&memblock.memory, addr) != -1;
+}
+
+int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+ int idx = memblock_search(&memblock.memory, base);
+
+ if (idx == -1)
+ return 0;
+ return memblock.memory.regions[idx].base <= base &&
+ (memblock.memory.regions[idx].base +
+ memblock.memory.regions[idx].size) >= (base + size);
+}
+
+int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+}
+
+
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
+{
+ memblock.current_limit = limit;
+}
+
+static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
{
+ unsigned long long base, size;
int i;
- for (i = 0; i < memblock.reserved.cnt; i++) {
- u64 upper = memblock.reserved.region[i].base +
- memblock.reserved.region[i].size - 1;
- if ((addr >= memblock.reserved.region[i].base) && (addr <= upper))
- return 1;
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->regions[i].base;
+ size = region->regions[i].size;
+
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
+ name, i, base, base + size - 1, size);
}
- return 0;
}
-int memblock_is_region_reserved(u64 base, u64 size)
+void __init_memblock memblock_dump_all(void)
{
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
}
-/*
- * Given a <base, len>, find which memory regions belong to this range.
- * Adjust the request and return a contiguous chunk.
- */
-int memblock_find(struct memblock_property *res)
+void __init memblock_analyze(void)
{
int i;
- u64 rstart, rend;
- rstart = res->base;
- rend = rstart + res->size - 1;
+ /* Check marker in the unused last array entry */
+ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+ WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+
+ memblock.memory_size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory_size += memblock.memory.regions[i].size;
+
+ /* We allow resizing from there */
+ memblock_can_resize = 1;
+}
+
+void __init memblock_init(void)
+{
+ static int init_done __initdata = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+
+ /* Hookup the initial arrays */
+ memblock.memory.regions = memblock_memory_init_regions;
+ memblock.memory.max = INIT_MEMBLOCK_REGIONS;
+ memblock.reserved.regions = memblock_reserved_init_regions;
+ memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
+
+ /* Write a marker in the unused last array entry */
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.regions[0].base = 0;
+ memblock.memory.regions[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.regions[0].base = 0;
+ memblock.reserved.regions[0].size = 0;
+ memblock.reserved.cnt = 1;
+
+ memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
+}
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+ struct memblock_type *type = m->private;
+ struct memblock_region *reg;
+ int i;
+
+ for (i = 0; i < type->cnt; i++) {
+ reg = &type->regions[i];
+ seq_printf(m, "%4d: ", i);
+ if (sizeof(phys_addr_t) == 4)
+ seq_printf(m, "0x%08lx..0x%08lx\n",
+ (unsigned long)reg->base,
+ (unsigned long)(reg->base + reg->size - 1));
+ else
+ seq_printf(m, "0x%016llx..0x%016llx\n",
+ (unsigned long long)reg->base,
+ (unsigned long long)(reg->base + reg->size - 1));
- for (i = 0; i < memblock.memory.cnt; i++) {
- u64 start = memblock.memory.region[i].base;
- u64 end = start + memblock.memory.region[i].size - 1;
-
- if (start > rend)
- return -1;
-
- if ((end >= rstart) && (start < rend)) {
- /* adjust the request */
- if (rstart < start)
- rstart = start;
- if (rend > end)
- rend = end;
- res->base = rstart;
- res->size = rend - rstart + 1;
- return 0;
- }
}
- return -1;
+ return 0;
+}
+
+static int memblock_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, memblock_debug_show, inode->i_private);
}
+
+static const struct file_operations memblock_debug_fops = {
+ .open = memblock_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("memblock", NULL);
+ if (!root)
+ return -ENXIO;
+ debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+
+ return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
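
Once memblock_init_debugfs() has run, the two region tables are browsable at runtime. A trivial reader; the path assumes debugfs is mounted at its conventional /sys/kernel/debug location and the kernel has CONFIG_DEBUG_FS enabled:

#include <stdio.h>

int main(void)
{
	/* Conventional debugfs mount point; usually requires root. */
	FILE *f = fopen("/sys/kernel/debug/memblock/memory", "r");
	char line[128];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one "index: start..end" per region */
	fclose(f);
	return 0;
}
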
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* to remember the boot option */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
#else
#define do_swap_account (0)
#endif
@@ -89,7 +96,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+ /* incremented at every pagein/pageout */
+ MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+ MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
MEM_CGROUP_STAT_NSTATS,
};
@@ -254,6 +264,12 @@ struct mem_cgroup {
* percpu counter.
*/
struct mem_cgroup_stat_cpu *stat;
+ /*
+ * used when a cpu is offlined or in other synchronization cases.
+ * See mem_cgroup_read_stat().
+ */
+ struct mem_cgroup_stat_cpu nocpu_base;
+ spinlock_t pcp_counter_lock;
};
/* Stuffs for move charges at task migration. */
@@ -269,7 +285,7 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
- spinlock_t lock; /* for from, to, moving_task */
+ spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -530,14 +546,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter have thresholds and do periodic
+ * synchronization to implement a "quick" read. There is a trade-off between
+ * reading cost and precision of the value, so we may later implement
+ * periodic synchronization for memcg's counters as well.
+ *
+ * But this _read() function is currently used for the user interface. The
+ * user accounts memory usage by memory cgroup and _always_ requires an
+ * exact value. Even if we provided a quick-and-fuzzy read, we would still
+ * have to visit all online cpus and compute the sum, so for now no extra
+ * synchronization is implemented (except for cpu hotplug).
+ *
+ * If kernel-internal users appear that can make use of a not-exact value,
+ * and reading all cpu values becomes a performance bottleneck in some common
+ * workload, a threshold and synchronization as in vmstat[] should be
+ * implemented.
+ */
static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
enum mem_cgroup_stat_index idx)
{
int cpu;
s64 val = 0;
- for_each_possible_cpu(cpu)
+ get_online_cpus();
+ for_each_online_cpu(cpu)
val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.count[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ put_online_cpus();
return val;
}
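
mem_cgroup_read_stat() above sums only the online CPUs and then adds nocpu_base, into which the hotplug callback later in this patch drains the counters of CPUs going dead; get_online_cpus() keeps the online mask stable during the walk. A userspace model of that "per-cpu deltas plus a drained base" invariant, with invented names:

#include <stdio.h>

#define NCPUS 4

static long percpu[NCPUS];	/* per-cpu deltas                */
static long base;		/* counts drained from dead cpus */
static int online[NCPUS] = { 1, 1, 1, 0 };

/* What the hotplug callback does when a cpu dies. */
static void drain_cpu(int cpu)
{
	base += percpu[cpu];
	percpu[cpu] = 0;
	online[cpu] = 0;
}

/* What the read side does: online cpus plus the drained base. */
static long read_stat(void)
{
	long val = base;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		if (online[cpu])
			val += percpu[cpu];
	return val;
}

int main(void)
{
	percpu[0] = 5; percpu[1] = 7; percpu[2] = 1;
	printf("%ld\n", read_stat());	/* 13 */
	drain_cpu(1);			/* cpu 1 goes away */
	printf("%ld\n", read_stat());	/* still 13 */
	return 0;
}
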
@@ -558,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- bool charge)
+ bool file, int nr_pages)
{
- int val = (charge) ? 1 : -1;
-
preempt_disable();
- if (PageCgroupCache(pc))
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+ if (file)
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
- if (charge)
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
- else
+ else {
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
preempt_enable();
}
@@ -659,40 +702,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return mem;
}
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
- int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
- int found, ret, nextid;
struct cgroup_subsys_state *css;
- struct mem_cgroup *mem;
+ int found;
- if (!root->use_hierarchy)
- return (*func)(root, data);
-
- nextid = 1;
- do {
- ret = 0;
+ if (!mem) /* ROOT cgroup has the smallest ID */
+ return root_mem_cgroup; /*css_put/get against root is ignored*/
+ if (!mem->use_hierarchy) {
+ if (css_tryget(&mem->css))
+ return mem;
+ return NULL;
+ }
+ rcu_read_lock();
+ /*
+ * searching a memory cgroup which has the smallest ID under given
+ * ROOT cgroup. (ID >= 1)
+ */
+ css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+ if (css && css_tryget(css))
+ mem = container_of(css, struct mem_cgroup, css);
+ else
mem = NULL;
+ rcu_read_unlock();
+ return mem;
+}
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+ struct mem_cgroup *root,
+ bool cond)
+{
+ int nextid = css_id(&iter->css) + 1;
+ int found;
+ int hierarchy_used;
+ struct cgroup_subsys_state *css;
+
+ hierarchy_used = iter->use_hierarchy;
+
+ css_put(&iter->css);
+ /* If no ROOT, walk all, ignore hierarchy */
+ if (!cond || (root && !hierarchy_used))
+ return NULL;
+
+ if (!root)
+ root = root_mem_cgroup;
+
+ do {
+ iter = NULL;
rcu_read_lock();
- css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
- &found);
+
+ css = css_get_next(&mem_cgroup_subsys, nextid,
+ &root->css, &found);
if (css && css_tryget(css))
- mem = container_of(css, struct mem_cgroup, css);
+ iter = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
-
- if (mem) {
- ret = (*func)(mem, data);
- css_put(&mem->css);
- }
+ /* If css is NULL, no more cgroups will be found */
nextid = found + 1;
- } while (!ret && css);
+ } while (css && !iter);
- return ret;
+ return iter;
}
+/*
+ * for_each_mem_cgroup_tree() visits all cgroups under the tree. Be careful:
+ * breaking out of the loop is not allowed because we hold a reference count.
+ * Instead, set "cond" to false and "continue" to exit the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
+ for (iter = mem_cgroup_start_loop(root);\
+ iter != NULL;\
+ iter = mem_cgroup_get_next(iter, root, cond))
+
+#define for_each_mem_cgroup_tree(iter, root) \
+ for_each_mem_cgroup_tree_cond(iter, root, true)
+
+#define for_each_mem_cgroup_all(iter) \
+ for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
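
The iterator behind these macros drops the css reference on the current element before fetching the next one, which is why the comment above forbids break: leaving the loop body directly would skip that put and leak a reference. A kernel-context sketch of the prescribed exit style; should_stop() and do_work() are hypothetical:

/* Kernel context: uses the for_each_mem_cgroup_tree_cond() macro
 * added above. */
static void walk_tree(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;
	bool cond = true;

	for_each_mem_cgroup_tree_cond(iter, root, cond) {
		if (should_stop(iter)) {
			cond = false;	/* request exit ...               */
			continue;	/* ... via the iterator, which    */
		}			/* drops the css reference for us */
		do_work(iter);
	}
}
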
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
@@ -730,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
* removed from global LRU.
*/
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
@@ -752,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc))
+ return;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
@@ -773,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
VM_BUG_ON(PageCgroupAcctLRU(pc));
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
@@ -946,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return NULL;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
if (!mz)
return NULL;
@@ -1001,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
@@ -1035,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
return false;
}
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns true if @mem can be charged @bytes, false if this would
+ * exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+ if (!res_counter_check_margin(&mem->res, bytes))
+ return false;
+ if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+ return false;
+ return true;
+}
+
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
@@ -1051,7 +1146,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+
+ synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ if (!mem)
+ return;
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+}
+/*
+ * Two routines for checking whether "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
+ * to avoid races in accounting. If true, pc->mem_cgroup may be
+ * overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
+ * under the hierarchy of moving cgroups. This is used for waiting at
+ * high memory pressure caused by "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+ return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
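
mem_cgroup_start_move() and mem_cgroup_stealed() above form an asymmetric reader/writer scheme: the rare writer bumps a per-cpu flag on every CPU and calls synchronize_rcu(), after which any reader inside rcu_read_lock() is guaranteed to observe the flag, while the hot read side costs only a this_cpu_read(). A kernel-context sketch of the shape; moving_count is a stand-in name, and serialization between concurrent writers (cgroup_mutex in this patch) is assumed to exist:

#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/rcupdate.h>

static DEFINE_PER_CPU(int, moving_count);

/* Rare path; concurrent callers must already be serialized. */
static void start_move(void)
{
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		per_cpu(moving_count, cpu) += 1;
	put_online_cpus();

	/* From here on, every reader inside rcu_read_lock()
	 * is guaranteed to see the raised flag. */
	synchronize_rcu();
}

static void end_move(void)
{
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		per_cpu(moving_count, cpu) -= 1;
	put_online_cpus();
}

/* Hot path: called under rcu_read_lock(); a single percpu read. */
static bool maybe_moving(void)
{
	return this_cpu_read(moving_count) > 0;
}
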
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
@@ -1092,13 +1232,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
return false;
}
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
-{
- int *val = data;
- (*val)++;
- return 0;
-}
-
/**
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
@@ -1173,7 +1306,10 @@ done:
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
- mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ num++;
return num;
}
@@ -1185,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
@@ -1322,49 +1459,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return total;
}
-static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
-{
- int *val = (int *)data;
- int x;
- /*
- * Logically, we can stop scanning immediately when we find
- * a memcg is already locked. But condidering unlock ops and
- * creation/removal of memcg, scan-all is simple operation.
- */
- x = atomic_inc_return(&mem->oom_lock);
- *val = max(x, *val);
- return 0;
-}
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- int lock_count = 0;
+ int x, lock_count = 0;
+ struct mem_cgroup *iter;
- mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+ for_each_mem_cgroup_tree(iter, mem) {
+ x = atomic_inc_return(&iter->oom_lock);
+ lock_count = max(x, lock_count);
+ }
if (lock_count == 1)
return true;
return false;
}
-static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
+ struct mem_cgroup *iter;
+
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
- atomic_add_unless(&mem->oom_lock, -1, 0);
+ for_each_mem_cgroup_tree(iter, mem)
+ atomic_add_unless(&iter->oom_lock, -1, 0);
return 0;
}
-static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
-{
- mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
-}
static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,35 +1589,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Considering some conditions, we don't need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix-tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg intentionally doesn't clear
+ * pc->mem_cgroup at "uncharge". So, we always see a valid pc->mem_cgroup
+ * even if there is a race with "uncharge". The statistics themselves are
+ * properly handled by flags.
+ *
+ * Considering "move", this is the only case where we see a race. To make
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
+ * detect the possibility of a race. If there is one, we take a lock.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
- struct page_cgroup *pc;
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
- pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
- lock_page_cgroup(pc);
+ rcu_read_lock();
mem = pc->mem_cgroup;
- if (!mem || !PageCgroupUsed(pc))
- goto done;
+ if (unlikely(!mem || !PageCgroupUsed(pc)))
+ goto out;
+ /* is pc->mem_cgroup unstable? */
+ if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
+ /* take a lock to access pc->mem_cgroup */
+ move_lock_page_cgroup(pc, &flags);
+ need_unlock = true;
+ mem = pc->mem_cgroup;
+ if (!mem || !PageCgroupUsed(pc))
+ goto out;
+ }
- /*
- * Preemption is already disabled. We can use __this_cpu_xxx
- */
- if (val > 0) {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- SetPageCgroupFileMapped(pc);
- } else {
- __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- ClearPageCgroupFileMapped(pc);
+ switch (idx) {
+ case MEMCG_NR_FILE_MAPPED:
+ if (val > 0)
+ SetPageCgroupFileMapped(pc);
+ else if (!page_mapped(page))
+ ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
+ break;
+ default:
+ BUG();
}
-done:
- unlock_page_cgroup(pc);
+ this_cpu_add(mem->stat->count[idx], val);
+
+out:
+ if (unlikely(need_unlock))
+ move_unlock_page_cgroup(pc, &flags);
+ rcu_read_unlock();
+ return;
}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1605,15 +1770,55 @@ static void drain_all_stock_sync(void)
atomic_dec(&memcg_drain_count);
}
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains the percpu counter values from a DEAD cpu and
+ * moves them to the local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+ int i;
+
+ spin_lock(&mem->pcp_counter_lock);
+ for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+ s64 x = per_cpu(mem->stat->count[i], cpu);
+
+ per_cpu(mem->stat->count[i], cpu) = 0;
+ mem->nocpu_base.count[i] += x;
+ }
+ /* need to clear ON_MOVE value, works as a kind of lock. */
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+ int idx = MEM_CGROUP_ON_MOVE;
+
+ spin_lock(&mem->pcp_counter_lock);
+ per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *iter;
- if (action != CPU_DEAD)
+ if (action == CPU_ONLINE) {
+ for_each_mem_cgroup_all(iter)
+ synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
+ }
+
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ for_each_mem_cgroup_all(iter)
+ mem_cgroup_drain_pcp_counter(iter, cpu);
+
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
@@ -1646,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
if (likely(!ret))
return CHARGE_OK;
+ res_counter_uncharge(&mem->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
- if (csize > PAGE_SIZE) /* change csize and retry */
+ /*
+ * csize can be either a huge page (HPAGE_SIZE), a batch of
+ * regular pages (CHARGE_SIZE), or a single regular page
+ * (PAGE_SIZE).
+ *
+ * Never reclaim on behalf of optional batching, retry with a
+ * single page instead.
+ */
+ if (csize == CHARGE_SIZE)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
+ gfp_mask, flags);
+ if (mem_cgroup_check_margin(mem_over_limit, csize))
+ return CHARGE_RETRY;
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
+ * Even though the limit is exceeded at this point, reclaim
+ * may have been able to free some pages. Retry the charge
+ * before killing the task.
+ *
+ * Only for regular pages, though: huge pages are rather
+ * unlikely to succeed so close to the limit, and we fall back
+ * to regular pages anyway in case of failure.
*/
- if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ if (csize == PAGE_SIZE && ret)
return CHARGE_RETRY;
/*
@@ -1691,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
	 * Unlike the global VM's OOM-kill, we're not in memory shortage
@@ -1721,7 +1940,7 @@ again:
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (page_size == PAGE_SIZE && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
@@ -1729,23 +1948,22 @@ again:
rcu_read_lock();
p = rcu_dereference(mm->owner);
- VM_BUG_ON(!p);
/*
- * because we don't have task_lock(), "p" can exit while
- * we're here. In that case, "mem" can point to root
- * cgroup but never be NULL. (and task_struct itself is freed
- * by RCU, cgroup itself is RCU safe.) Then, we have small
- * risk here to get wrong cgroup. But such kind of mis-account
- * by race always happens because we don't have cgroup_mutex().
- * It's overkill and we allow that small race, here.
+ * Because we don't have task_lock(), "p" can exit.
+	 * In that case, "mem" can point to root, or p can be NULL due to
+	 * a race with swapoff. Then, we have a small risk of mis-accounting.
+	 * But that kind of mis-accounting by race always happens because
+	 * we don't have cgroup_mutex(). It's overkill and we allow that
+	 * small race here.
+	 * (*) swapoff et al. will charge against the mm struct, not against
+	 * the task struct. So, mm->owner can be NULL.
*/
mem = mem_cgroup_from_task(p);
- VM_BUG_ON(!mem);
- if (mem_cgroup_is_root(mem)) {
+ if (!mem || mem_cgroup_is_root(mem)) {
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
			 * It seems dangerous to access memcg without css_get().
			 * But considering how consume_stock works, it's not
@@ -1786,7 +2004,7 @@ again:
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ csize = page_size;
css_put(&mem->css);
mem = NULL;
goto again;
@@ -1807,8 +2025,8 @@ again:
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
css_put(&mem->css);
done:
*memcg = mem;
@@ -1836,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
{
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
@@ -1888,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
return mem;
}
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
{
+ int nr_pages = page_size >> PAGE_SHIFT;
+
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;
@@ -1904,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, page_size);
return;
}
-
+ /*
+	 * We don't need page_cgroup_lock for tail pages, because they are not
+ * accessed by any other context at this point.
+ */
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -1931,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
break;
}
- mem_cgroup_charge_statistics(mem, pc, true);
-
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
/*
* "charge_statistics" updated event counter. Then, check it.
@@ -1942,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
memcg_check_events(mem, pc->page);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+ (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set that flag here. We're
+ * under zone->lru_lock, 'splitting on pmd' and compound_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
+ struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+ unsigned long flags;
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * We have no races with charge/uncharge but will have races with
+ * page state accounting.
+ */
+ move_lock_page_cgroup(head_pc, &flags);
+
+ tail_pc->mem_cgroup = head_pc->mem_cgroup;
+ smp_wmb(); /* see __commit_charge() */
+ if (PageCgroupAcctLRU(head_pc)) {
+ enum lru_list lru;
+ struct mem_cgroup_per_zone *mz;
+
+ /*
+		 * LRU flags cannot be copied because we need to add the tail
+		 * page to the LRU by a generic call, and our hook will be called.
+		 * We hold lru_lock, so reduce the counter directly.
+ */
+ lru = page_lru(head);
+ mz = page_cgroup_zoneinfo(head_pc);
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ }
+ tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ move_unlock_page_cgroup(head_pc, &flags);
+}
+#endif
+
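For context, the expected caller of the fixup above is the THP split path, invoking it once per tail page while the locks named in the comment are held; a hedged sketch (the loop bound HPAGE_PMD_NR and the placement are assumptions, not shown in this hunk):

/*
 * Hedged sketch of the split path calling the fixup for every tail
 * page of a compound page, under zone->lru_lock and compound_lock.
 */
static void sketch_split_huge_fixup_all(struct page *head)
{
	int i;

	for (i = 1; i < HPAGE_PMD_NR; i++)
		mem_cgroup_split_huge_fixup(head, head + i);
}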
/**
* __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
@@ -1960,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
*/
static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+ int charge_size)
{
+ int nr_pages = charge_size >> PAGE_SHIFT;
+
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!page_is_cgroup_locked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
@@ -1975,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, pc, false);
+ mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ mem_cgroup_cancel_charge(from, charge_size);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, pc, true);
+ mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
@@ -1997,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
* __mem_cgroup_move_account()
*/
static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to,
+ bool uncharge, int charge_size)
{
int ret = -EINVAL;
+ unsigned long flags;
+ /*
+	 * The page is isolated from the LRU. So, the collapse function
+	 * will not handle this page, but page splitting can happen.
+	 * Do this check under compound_page_lock(); the caller should
+ * hold it.
+ */
+ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+ return -EBUSY;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- __mem_cgroup_move_account(pc, from, to, uncharge);
+ move_lock_page_cgroup(pc, &flags);
+ __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
@@ -2026,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
+ int page_size = PAGE_SIZE;
+ unsigned long flags;
int ret;
/* Is ROOT ? */
@@ -2038,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (isolate_lru_page(page))
goto put;
+ if (PageTransHuge(page))
+ page_size = HPAGE_SIZE;
+
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+ &parent, false, page_size);
if (ret || !parent)
goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent, true);
+ if (page_size > PAGE_SIZE)
+ flags = compound_lock_irqsave(page);
+
+ ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ mem_cgroup_cancel_charge(parent, page_size);
+
+ if (page_size > PAGE_SIZE)
+ compound_unlock_irqrestore(page, flags);
put_back:
putback_lru_page(page);
put:
@@ -2064,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
struct page_cgroup *pc;
+ bool oom = true;
int ret;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ /*
+ * Never OOM-kill a process for a huge page. The
+ * fault handler will fall back to regular pages.
+ */
+ oom = false;
+ }
+
pc = lookup_page_cgroup(page);
/* can happen at boot */
if (unlikely(!pc))
return 0;
prefetchw(pc);
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
@@ -2086,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -2193,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
@@ -2215,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
@@ -2264,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
@@ -2295,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
@@ -2308,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
@@ -2322,8 +2625,10 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
@@ -2331,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
@@ -2363,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
ClearPageCgroupUsed(pc);
/*
@@ -2384,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ __do_uncharge(mem, ctype, page_size);
return mem;
@@ -2579,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page,
enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2628,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page,
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
@@ -2655,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
@@ -2670,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
@@ -3038,6 +3349,7 @@ move_account:
lru_add_drain_all();
drain_all_stock_sync();
ret = 0;
+ mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
@@ -3051,6 +3363,7 @@ move_account:
if (ret)
break;
}
+ mem_cgroup_end_move(mem);
memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
@@ -3137,33 +3450,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
return retval;
}
-struct mem_cgroup_idx_data {
- s64 val;
- enum mem_cgroup_stat_index idx;
-};
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
- struct mem_cgroup_idx_data *d = data;
- d->val += mem_cgroup_read_stat(mem, d->idx);
- return 0;
-}
+ struct mem_cgroup *iter;
+ s64 val = 0;
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx, s64 *val)
-{
- struct mem_cgroup_idx_data d;
- d.idx = idx;
- d.val = 0;
- mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
- *val = d.val;
+	/* each per-cpu value can be negative, hence use s64 */
+ for_each_mem_cgroup_tree(iter, mem)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? */
+ val = 0;
+ return val;
}
static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
{
- u64 idx_val, val;
+ u64 val;
if (!mem_cgroup_is_root(mem)) {
if (!swap)
@@ -3172,16 +3477,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
return res_counter_read_u64(&mem->memsw, RES_USAGE);
}
- mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
- val = idx_val;
- mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
- val += idx_val;
+ val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
- if (swap) {
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT, &idx_val);
- val += idx_val;
- }
+ if (swap)
+ val += mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
@@ -3389,9 +3690,9 @@ struct {
};
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- struct mcs_total_stat *s = data;
s64 val;
/* per cpu stat */
@@ -3421,13 +3722,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
- return 0;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_get_local_stat(iter, s);
}
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3907,7 @@ static int compare_thresholds(const void *a, const void *b)
return _a->threshold - _b->threshold;
}
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
struct mem_cgroup_eventfd_list *ev;
@@ -3615,7 +3918,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
- mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_oom_notify_cb(iter);
}
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -3986,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
@@ -4016,23 +4320,25 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!mem->stat) {
- if (size < PAGE_SIZE)
- kfree(mem);
- else
- vfree(mem);
- mem = NULL;
- }
+ if (!mem->stat)
+ goto out_free;
+ spin_lock_init(&mem->pcp_counter_lock);
return mem;
+
+out_free:
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ return NULL;
}
/*
@@ -4158,7 +4464,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
- hotcpu_notifier(memcg_stock_cpu_callback, 0);
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
@@ -4268,7 +4574,8 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
@@ -4430,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4467,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
@@ -4505,17 +4818,28 @@ static void mem_cgroup_clear_mc(void)
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
- mc.moving_task = NULL;
spin_unlock(&mc.lock);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
+ mem_cgroup_end_move(from);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4542,15 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
+ mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
- mc.moving_task = current;
spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
@@ -4579,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
@@ -4599,7 +4921,7 @@ retry:
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(pc,
- mc.from, mc.to, false)) {
+ mc.from, mc.to, false, PAGE_SIZE)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -4644,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- down_read(&mm->mmap_sem);
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+		 * Someone who is holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4723,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = {
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+ /* consider enabled if no parameter or 1 is given */
+ if (!(*s) || !strcmp(s, "=1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "=0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount", enable_swap_account);
static int __init disable_swap_account(char *s)
{
- really_do_swap_account = 0;
+ printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+ enable_swap_account("=0");
return 1;
}
__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows us to stop
+ * using not-yet-corrupted but suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously with respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example, the operation to map
+ * back from RMAP chains to processes has to walk the complete process
+ * list and has non-linear complexity with the number of mappings. But
+ * since memory corruptions are rare we hope to get away with this. This
+ * avoids impacting the core VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -47,6 +51,7 @@
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -78,7 +83,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -198,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -228,8 +233,8 @@ void shake_page(struct page *p, int access)
}
/*
- * Only all shrink_slab here (which would also
- * shrink other caches) if access is not potentially fatal.
+ * Only call shrink_slab here (which would also shrink other caches) if
+ * access is not potentially fatal.
*/
if (access) {
int nr;
@@ -268,7 +273,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -309,7 +314,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -577,7 +582,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -693,11 +698,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* Issues:
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
+ int res = 0;
struct page *hpage = compound_head(p);
/*
* We can safely recover from error on free or reserved (i.e.
@@ -710,8 +714,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* so there is no race between isolation and mapping/unmapping.
*/
if (!(page_mapping(hpage) || PageAnon(hpage))) {
- __isolate_hwpoisoned_huge_page(hpage);
- return RECOVERED;
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
}
return DELAYED;
}
@@ -836,8 +841,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,9 +852,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
struct page *hpage = compound_head(p);
+ struct page *ppage;
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -893,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
+ * ppage: the poisoned page
+ *   if p is a regular (4k) page,
+ *     ppage == the real poisoned page;
+ *   else p is hugetlb or THP, and ppage == the head page.
+ */
+ ppage = hpage;
+
+ if (PageTransHuge(hpage)) {
+ /*
+		 * Verify that this isn't a hugetlbfs head page; the check for
+		 * PageAnon is just to avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away from
+		 * under us because we hold a refcount on the hpage. Without a
+		 * refcount on the hpage, split_huge_page couldn't be safely
+		 * called in the first place, and having a refcount on a tail
+		 * page isn't enough to be safe.
+ */
+ if (!PageHuge(hpage) && PageAnon(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ /*
+				 * FIXME: if splitting the THP fails, it is
+				 * better to stop the following operation rather
+				 * than cause a panic by unmapping. The system
+				 * might survive if the page is freed later.
+ */
+ printk(KERN_INFO
+ "MCE %#lx: failed to split THP\n", pfn);
+
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
+ }
+
+ /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -901,22 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(hpage, &tokill);
+ collect_procs(ppage, &tokill);
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(hpage, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
+ if (hpage != ppage)
+ lock_page_nosync(ppage);
+ ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(ppage));
+
+ if (hpage != ppage)
+ unlock_page(ppage);
/*
* Now that the dirty bit has been propagated to the
@@ -927,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	 * use a more forceful, uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+ kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
ret != SWAP_SUCCESS, p, pfn);
return ret;
@@ -936,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
static void set_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
SetPageHWPoison(hpage + i);
}
@@ -944,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
static void clear_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
ClearPageHWPoison(hpage + i);
}
@@ -974,14 +1011,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- nr_pages = 1 << compound_order(hpage);
+ nr_pages = 1 << compound_trans_order(hpage);
atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+	 *    an affected hugepage will be dequeued from the hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -993,6 +1033,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -1007,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p) && !PageHuge(p))
- shake_page(p, 0);
- if (!PageLRU(p) && !PageHuge(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try", DELAYED);
- return 0;
+ if (!PageHuge(p) && !PageTransCompound(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ action_result(pfn, "free buddy, 2nd try",
+ DELAYED);
+ return 0;
+ }
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
/*
@@ -1049,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
*/
- if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, "hugepage already hardware poisoned",
IGNORED);
unlock_page(hpage);
@@ -1147,16 +1208,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
- nr_pages = 1 << compound_order(page);
+ nr_pages = 1 << compound_trans_order(page);
if (!get_page_unless_zero(page)) {
+ /*
+		 * Since a HWPoisoned hugepage should have a non-zero refcount,
+		 * a race between memory failure and unpoison seems to have
+		 * happened. In such a case unpoison fails and memory failure
+		 * runs to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1168,12 +1239,12 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
- if (PageHuge(p))
- clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
@@ -1187,7 +1258,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1204,25 +1279,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_system_sleep prevents a race with memory hotplug,
- * because the isolation assumes there's only a single user.
+ * The lock_memory_hotplug prevents a race with memory hotplug.
	 * This is a big hammer; a better solution would be nicer.
*/
- lock_system_sleep();
+ lock_memory_hotplug();
/*
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+	 * from the free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+ if (PageHuge(p)) {
+ pr_info("get_any_page: %#lx free huge page\n", pfn);
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
+ pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+ pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags);
ret = -EIO;
}
@@ -1231,7 +1312,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
ret = 1;
}
unset_migratetype_isolate(p);
- unlock_system_sleep();
+ unlock_memory_hotplug();
+ return ret;
+}
+
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
+ if (PageHWPoison(hpage)) {
+ put_page(hpage);
+ pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+
+ list_add(&hpage->lru, &pagelist);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+ true);
+ if (ret) {
+ struct page *page1, *page2;
+ list_for_each_entry_safe(page1, page2, &pagelist, lru)
+ put_page(page1);
+
+ pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0)
+ ret = -EIO;
+ return ret;
+ }
+done:
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ /* keep elevated page count for bad page */
return ret;
}
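Soft offlining is normally driven from user space; a minimal sketch of exercising the new hugepage path via madvise, assuming a kernel with CONFIG_MEMORY_FAILURE and sufficient privileges (error handling trimmed):

#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

/* ask the kernel to migrate the backing page(s) and retire the frames */
static int sketch_soft_offline(void *addr, size_t len)
{
	return madvise(addr, len, MADV_SOFT_OFFLINE);
}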
@@ -1262,6 +1387,9 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+
ret = get_any_page(page, pfn, flags);
if (ret < 0)
return ret;
@@ -1288,7 +1416,7 @@ int soft_offline_page(struct page *page, int flags)
goto done;
}
if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
@@ -1302,7 +1430,7 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1323,7 +1451,7 @@ int soft_offline_page(struct page *page, int flags)
put_page(page);
if (ret == 1) {
ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
}
@@ -1337,15 +1465,17 @@ int soft_offline_page(struct page *page, int flags)
LIST_HEAD(pagelist);
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ 0, true);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ putback_lru_pages(&pagelist);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
if (ret)
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address)
{
pgtable_t new = pte_alloc_one(mm, address);
+ int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
spin_lock(&mm->page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ wait_split_huge_page = 0;
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
- }
+ } else if (unlikely(pmd_trans_splitting(*pmd)))
+ wait_split_huge_page = 1;
spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
+ if (wait_split_huge_page)
+ wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
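Callers reach __pte_alloc through the pte_alloc_map() wrapper, which now has to pass the vma along; a hedged sketch of the updated wrapper (the exact macro body lives in the headers, so treat this as an approximation):

#define pte_alloc_map(mm, vma, pmd, address)				\
	((unlikely(pmd_none(*(pmd))) &&					\
	  __pte_alloc(mm, vma, pmd, address)) ?				\
		NULL : pte_offset_map(pmd, address))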
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- }
+ } else
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
return 0;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
@@ -736,7 +743,7 @@ again:
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
- src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
@@ -767,7 +774,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(orig_src_pte);
+ pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*src_pmd)) {
+ int err;
+ VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ err = copy_huge_pmd(dst_mm, src_mm,
+ dst_pmd, src_pmd, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next-addr != HPAGE_PMD_SIZE) {
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd)) {
+ (*zap_work)--;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd)) {
(*zap_work)--;
continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pud = pud_offset(pgd, address);
if (pud_none(*pud))
goto no_page_table;
- if (pud_huge(*pud)) {
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto no_page_table;
- if (pmd_huge(*pmd)) {
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(mm, pmd);
+ goto split_fallthrough;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(mm, address,
+ pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ /* fall through */
+ }
+split_fallthrough:
if (unlikely(pmd_bad(*pmd)))
goto no_page_table;
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
*/
mark_page_accessed(page);
}
+ if (flags & FOLL_MLOCK) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here and migration is
+ * blocked by the pte's page reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
unlock:
pte_unmap_unlock(ptep, ptl);
out:
@@ -1341,7 +1412,8 @@ no_page_table:
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking)
{
int i;
unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pmd = pmd_offset(pud, pg);
if (pmd_none(*pmd))
return i ? : -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, pg);
if (pte_none(*pte)) {
pte_unmap(pte);
@@ -1441,16 +1514,22 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
+ unsigned int fault_flags = 0;
+
+ if (foll_flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
ret = handle_mm_fault(mm, vma, start,
- (foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+ (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+ VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
BUG();
}
@@ -1459,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
else
tsk->min_flt++;
+ if (ret & VM_FAULT_RETRY) {
+ *nonblocking = 0;
+ return i;
+ }
+
/*
* The VM_FAULT_WRITE bit tells us that
* do_wp_page has broken COW when necessary,
@@ -1558,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
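The new nonblocking argument changes the locking contract: when a fault blocks, handle_mm_fault drops mmap_sem (VM_FAULT_RETRY), *nonblocking is cleared, and the call returns early with the pages faulted so far. A hedged sketch of a caller honoring that protocol, modeled loosely on the mlock-style users this was written for (names and flags are illustrative):

/*
 * Fault in nr_pages pages starting at start, tolerating mmap_sem being
 * dropped on blocking faults. mmap_sem must be held for read on entry
 * and is held again on return. Sketch only: a real caller would
 * revalidate the vma after re-taking the semaphore.
 */
static long sketch_fault_in_pages(struct task_struct *tsk,
				  struct mm_struct *mm,
				  unsigned long start, int nr_pages)
{
	int done = 0;

	while (done < nr_pages) {
		int locked = 1;
		int ret = __get_user_pages(tsk, mm,
					   start + done * PAGE_SIZE,
					   nr_pages - done, FOLL_TOUCH,
					   NULL, NULL, &locked);
		if (!locked)		/* dropped on VM_FAULT_RETRY */
			down_read(&mm->mmap_sem);
		if (ret <= 0)
			break;
		done += ret;
	}
	return done;
}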
@@ -1583,22 +1668,25 @@ struct page *get_dump_page(unsigned long addr)
struct page *page;
if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
return NULL;
flush_cache_page(vma, addr, page_to_pfn(page));
return page;
}
#endif /* CONFIG_ELF_CORE */
-pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pgd_t * pgd = pgd_offset(mm, addr);
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
+ if (pmd) {
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
}
return NULL;
}
@@ -1817,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
@@ -2047,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
- * servicing faults for write access. In the normal case, do always want
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
- if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
-}
-
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
@@ -2079,7 +2155,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
+ clear_page(kaddr);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
} else
@@ -2107,10 +2183,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = 0;
+ int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
@@ -2142,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
- page_cache_release(old_page);
goto unlock;
}
page_cache_release(old_page);
}
- reuse = reuse_swap_page(old_page);
- if (reuse)
+ if (reuse_swap_page(old_page)) {
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ goto reuse;
+ }
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
@@ -2210,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
- page_cache_release(old_page);
goto unlock;
}
@@ -2218,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
dirty_page = old_page;
get_page(dirty_page);
- reuse = 1;
- }
- if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- goto unlock;
+
+ if (!dirty_page)
+ return ret;
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+	 * bit after it clears all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_no_page is protected similarly.
+ */
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
+ put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ return ret;
}
/*
@@ -2254,16 +2365,6 @@ gotten:
}
__SetPageUptodate(new_page);
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if ((vma->vm_flags & VM_LOCKED) && old_page) {
- lock_page(old_page); /* for LRU manipulation */
- clear_page_mlock(old_page);
- unlock_page(old_page);
- }
-
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
@@ -2331,42 +2432,19 @@ gotten:
if (new_page)
page_cache_release(new_page);
- if (old_page)
- page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
+ if (old_page) {
/*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_no_page is protected similarly.
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
*/
- if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
- }
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ lock_page(old_page); /* LRU manipulation */
+ munlock_vma_page(old_page);
+ unlock_page(old_page);
}
-
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
+ page_cache_release(old_page);
}
return ret;
oom_free_new:
@@ -2570,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
details.i_mmap_lock = &mapping->i_mmap_lock;
+ mutex_lock(&mapping->unmap_mutex);
spin_lock(&mapping->i_mmap_lock);
/* Protect against endless unmapping loops */
@@ -2586,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->unmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2626,6 +2706,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
+ int locked;
struct mem_cgroup *ptr = NULL;
int exclusive = 0;
int ret = 0;
@@ -2676,8 +2757,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_release;
}
- lock_page(page);
+ locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ if (!locked) {
+ ret |= VM_FAULT_RETRY;
+ goto out_release;
+ }
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2926,7 +3011,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY)))
return ret;
if (unlikely(PageHWPoison(vmf.page))) {
@@ -2967,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
charged = 1;
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (vma->vm_flags & VM_LOCKED)
- clear_page_mlock(vmf.page);
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -3139,9 +3219,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
@@ -3185,7 +3265,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
@@ -3220,9 +3300,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- pte = pte_alloc_map(mm, pmd, address);
- if (!pte)
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma, address,
+ pmd, orig_pmd);
+ return 0;
+ }
+ }
+
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if a huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
+ /* if a huge pmd materialized from under us, just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
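
The hunk above never runs pte_offset_map() until the pmd is known to be stable: it allocates the pte table with __pte_alloc(), then re-checks pmd_trans_huge(), because khugepaged (which takes mmap_sem for write) could have installed a huge pmd in the window. A minimal userspace sketch of this allocate-then-recheck pattern, using C11 atomics in place of the page-table locking (HUGE_MARKER, install_table and the slot layout are hypothetical stand-ins):

	#include <stdatomic.h>
	#include <stdlib.h>

	#define HUGE_MARKER ((void *)1)	/* stands in for a trans-huge pmd */

	/* Returns a table pointer, or NULL meaning "huge entry appeared, retry". */
	static void *install_table(_Atomic(void *) *slot)
	{
		void *cur = atomic_load(slot);
		if (cur == HUGE_MARKER)
			return NULL;		/* like returning 0 to retry the fault */
		if (cur)
			return cur;

		void *table = malloc(64);	/* stand-in for __pte_alloc() */
		void *expected = NULL;
		if (!atomic_compare_exchange_strong(slot, &expected, table)) {
			free(table);		/* lost the race to another thread */
			return expected == HUGE_MARKER ? NULL : expected;
		}
		return table;
	}
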
@@ -3288,7 +3399,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
vma = find_vma(current->mm, addr);
if (!vma)
return -ENOMEM;
- write = (vma->vm_flags & VM_WRITE) != 0;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3343,7 +3459,7 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
-static int follow_pte(struct mm_struct *mm, unsigned long address,
+static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -3360,6 +3476,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address,
goto out;
pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
@@ -3380,6 +3497,17 @@ out:
return -EINVAL;
}
+static inline int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ return res;
+}
+
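The (void) __cond_lock() idiom above exists purely for sparse: __follow_pte() returns with *ptlp held only on success, and the wrapper teaches the checker about that conditional acquire. From memory, the macro in include/linux/compiler.h is roughly the following, so treat this as a sketch rather than a verbatim quote:

	#ifdef __CHECKER__
	# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
	#else
	# define __cond_lock(x, c)	(c)
	#endif

Under sparse the ({ __acquire(x); 1; }) branch balances the unlock the caller performs after a successful return; in a normal build it reduces to the bare condition, which is why the wrapper casts the whole expression to void.
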
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -3589,3 +3717,74 @@ void might_fault(void)
}
EXPORT_SYMBOL(might_fault);
#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
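
clear_huge_page() and copy_user_huge_page() above deliberately work one subpage at a time with a cond_resched() between chunks, rather than issuing one long clear or copy, so a 2MB (or gigantic) page cannot monopolize the CPU; the mem_map_next() variants exist because gigantic pages may span discontiguous mem_map sections. A userspace model of the chunking (PAGE_SIZE and sched_yield() are stand-ins for the kernel's page size and cond_resched()):

	#include <sched.h>
	#include <string.h>

	#define PAGE_SIZE 4096UL

	static void clear_huge_region(char *base, unsigned long subpages)
	{
		for (unsigned long i = 0; i < subpages; i++) {
			sched_yield();	/* plays the role of cond_resched() */
			memset(base + i * PAGE_SIZE, 0, PAGE_SIZE);
		}
	}
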
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..321fc7455df7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@
#include "internal.h"
+DEFINE_MUTEX(mem_hotplug_mutex);
+
+void lock_memory_hotplug(void)
+{
+ mutex_lock(&mem_hotplug_mutex);
+
+ /* for exclusive hibernation if CONFIG_HIBERNATION=y */
+ lock_system_sleep();
+}
+
+void unlock_memory_hotplug(void)
+{
+ unlock_system_sleep();
+ mutex_unlock(&mem_hotplug_mutex);
+}
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
@@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page, int type)
+static void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
{
- atomic_set(&page->_mapcount, type);
+ page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
atomic_inc(&page->_count);
@@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
* so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
- int type;
+ unsigned long type;
- type = atomic_read(&page->_mapcount);
- BUG_ON(type >= -1);
+ type = (unsigned long) page->lru.next;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (atomic_dec_return(&page->_count) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- reset_page_mapcount(page);
+ INIT_LIST_HEAD(&page->lru);
__free_pages_bootmem(page, 0);
}
@@ -390,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
int ret;
struct memory_notify arg;
+ lock_memory_hotplug();
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
@@ -402,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
/*
@@ -426,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
nr_pages, pfn);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
@@ -450,6 +472,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
+ unlock_memory_hotplug();
return 0;
}
@@ -493,7 +516,7 @@ int mem_online_node(int nid)
pg_data_t *pgdat;
int ret;
- lock_system_sleep();
+ lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (pgdat) {
ret = -ENOMEM;
@@ -504,7 +527,7 @@ int mem_online_node(int nid)
BUG_ON(ret);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
@@ -516,7 +539,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- lock_system_sleep();
+ lock_memory_hotplug();
res = register_memory_resource(start, size);
ret = -EEXIST;
@@ -563,7 +586,7 @@ error:
release_memory_resource(res);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -602,27 +625,14 @@ static struct page *next_active_pageblock(struct page *page)
/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- int type;
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
- type = get_pageblock_migratetype(page);
-
- /*
- * A pageblock containing MOVABLE or free pages is considered
- * removable
- */
- if (type != MIGRATE_MOVABLE && !pageblock_free(page))
- return 0;
-
- /*
- * A pageblock starting with a PageReserved page is not
- * considered removable.
- */
- if (PageReserved(page))
+ if (!is_pageblock_removable_nolock(page))
return 0;
+ cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +669,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
* Scanning pfn is much easier than scanning lru list.
* Scan pfn from start to end and Find LRU page.
*/
-int scan_lru_pages(unsigned long start, unsigned long end)
+static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
struct page *page;
@@ -709,29 +719,31 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page_is_file_cache(page));
} else {
- /* Becasue we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page))
- not_managed++;
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
dump_page(page);
#endif
+ /* Because we don't have big zone->lock, we should
+ check this again here. */
+ if (page_count(page)) {
+ not_managed++;
+ ret = -EBUSY;
+ break;
+ }
}
}
- ret = -EBUSY;
- if (not_managed) {
- if (!list_empty(&source))
+ if (!list_empty(&source)) {
+ if (not_managed) {
+ putback_lru_pages(&source);
+ goto out;
+ }
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+ true, true);
+ if (ret)
putback_lru_pages(&source);
- goto out;
}
- ret = 0;
- if (list_empty(&source))
- goto out;
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
-
out:
return ret;
}
@@ -803,7 +815,7 @@ static int offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- lock_system_sleep();
+ lock_memory_hotplug();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -840,7 +852,6 @@ repeat:
ret = 0;
if (drain) {
lru_add_drain_all();
- flush_scheduled_work();
cond_resched();
drain_all_pages();
}
@@ -862,7 +873,6 @@ repeat:
}
/* drain all zone's lru pagevecs, this is asynchronous... */
lru_add_drain_all();
- flush_scheduled_work();
yield();
/* drain pcp pages, this is synchronous. */
drain_all_pages();
@@ -894,7 +904,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- unlock_system_sleep();
+ unlock_memory_hotplug();
return 0;
failed_removal:
@@ -905,7 +915,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn);
out:
- unlock_system_sleep();
+ unlock_memory_hotplug();
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(vma->vm_mm, pmd);
if (pmd_none_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -924,15 +925,22 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
- if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest, 0);
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages(&pagelist, new_node_page, dest,
+ false, true);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
return err;
}
@@ -1147,9 +1155,13 @@ static long do_mbind(unsigned long start, unsigned long len,
err = mbind_range(mm, start, end, new);
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma, 0);
+ (unsigned long)vma,
+ false, true);
+ if (nr_failed)
+ putback_lru_pages(&pagelist);
+ }
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
@@ -1298,15 +1310,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
err = -ESRCH;
goto out;
}
mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
err = -EINVAL;
if (!mm)
@@ -1588,7 +1600,7 @@ unsigned slab_node(struct mempolicy *policy)
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone->node;
+ return zone ? zone->node : numa_node_id();
}
default:
@@ -1784,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
@@ -1793,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
+ * @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
@@ -1806,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* Should be called with the mm_sem of the vma hold.
*/
struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
@@ -1818,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- page = alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, order, nid);
put_mems_allowed();
return page;
}
@@ -1827,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
+ struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
put_mems_allowed();
@@ -1836,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* fast path: default or task policy
*/
- page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order, zl,
+ policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..766115253807 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,8 +32,11 @@
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+
#include "internal.h"
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -95,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte;
spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
+ if (unlikely(PageHuge(new))) {
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep)
+ goto out;
+ ptl = &mm->page_table_lock;
+ } else {
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_trans_huge(*pmd))
+ goto out;
+ if (!pmd_present(*pmd))
+ goto out;
- ptep = pte_offset_map(pmd, addr);
+ ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ goto out;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ }
- ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
@@ -130,10 +143,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new))
+ pte = pte_mkhuge(pte);
+#endif
flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
- if (PageAnon(new))
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, addr);
+ else
+ page_dup_rmap(new);
+ } else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
@@ -226,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -276,11 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ get_page(newpage);
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count);
+
+ __put_page(page);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
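migrate_huge_page_move_mapping() above hinges on page_freeze_refs(): the refcount is taken to zero only if it exactly matches the expected count, so any concurrent reference holder makes the migration bail out with -EAGAIN instead of racing with the radix-tree replacement. A C11 sketch of that freeze/unfreeze pair (names hypothetical; the kernel versions operate on page->_count):

	#include <stdatomic.h>

	/* Succeeds, returning nonzero, only if no one else holds a reference. */
	static int freeze_refs(atomic_int *refs, int expected)
	{
		int e = expected;
		return atomic_compare_exchange_strong(refs, &e, 0);
	}

	/* Republish the references once the radix tree slot has been switched. */
	static void unfreeze_refs(atomic_int *refs, int count)
	{
		atomic_store(refs, count);
	}
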
+/*
* Copy the page to its new location
*/
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
{
- copy_highpage(newpage, page);
+ if (PageHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
@@ -431,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page)
.nr_to_write = 1,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1
};
int rc;
@@ -547,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, int offlining)
+ struct page *page, int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
- int rcu_locked = 0;
int charge = 0;
struct mem_cgroup *mem = NULL;
struct anon_vma *anon_vma = NULL;
@@ -565,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/* page was freed from under us. So we are done. */
goto move_newpage;
}
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto move_newpage;
/* prepare cgroup just returns 0 or -ENOMEM */
rc = -EAGAIN;
@@ -572,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (!trylock_page(page)) {
if (!force)
goto move_newpage;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readpages). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto move_newpage;
+
lock_page(page);
}
@@ -598,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
BUG_ON(charge);
if (PageWriteback(page)) {
- if (!force)
+ if (!force || !sync)
goto uncharge;
wait_on_page_writeback(page);
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
 * of migration. File cache pages are no problem because of page_lock().
 * File cache pages may use write_page() or lock_page() in migration, so
 * we only need to take care of anon pages here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- /* Determine how to safely use anon_vma */
- if (!page_mapped(page)) {
- if (!PageSwapCache(page))
- goto rcu_unlock;
-
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ } else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
@@ -633,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
*/
remap_swapcache = 0;
} else {
- /*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
- */
- anon_vma = page_anon_vma(page);
- get_anon_vma(anon_vma);
+ goto uncharge;
}
}
@@ -656,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
@@ -679,20 +761,18 @@ skip_unmap:
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
drop_anon_vma(anon_vma);
- if (rcu_locked)
- rcu_read_unlock();
uncharge:
if (!charge)
- mem_cgroup_end_migration(mem, page, newpage);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+move_newpage:
if (rc != -EAGAIN) {
/*
* A page that has been migrated has all references
@@ -706,8 +786,6 @@ unlock:
putback_lru_page(page);
}
-move_newpage:
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
@@ -724,6 +802,81 @@ move_newpage:
}
/*
+ * Counterpart of unmap_and_move() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepages.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and the writeback status of all subpages is counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference count of the head page is 512 and a bit more.)
+ * This means that when we try to migrate a hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+ unsigned long private, struct page *hpage,
+ int force, bool offlining, bool sync)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
+ struct anon_vma *anon_vma = NULL;
+
+ if (!new_hpage)
+ return -ENOMEM;
+
+ rc = -EAGAIN;
+
+ if (!trylock_page(hpage)) {
+ if (!force || !sync)
+ goto out;
+ lock_page(hpage);
+ }
+
+ if (PageAnon(hpage)) {
+ anon_vma = page_lock_anon_vma(hpage);
+ if (anon_vma) {
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ }
+ }
+
+ try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, 1);
+
+ if (rc)
+ remove_migration_ptes(hpage, hpage);
+
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
+out:
+ unlock_page(hpage);
+
+ if (rc != -EAGAIN) {
+ list_del(&hpage->lru);
+ put_page(hpage);
+ }
+
+ put_page(new_hpage);
+
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(new_hpage);
+ }
+ return rc;
+}
+
+/*
* migrate_pages
*
* The function takes one list of pages to migrate and a function
@@ -732,13 +885,15 @@ move_newpage:
*
* The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
- * or no retryable pages exist anymore. All pages will be
- * returned to the LRU or freed.
+ * or no retryable pages exist anymore.
+ * Caller should call putback_lru_pages() to return pages to the LRU
+ * or to the free list, but only if ret != 0.
*
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -758,7 +913,8 @@ int migrate_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move(get_new_page, private,
- page, pass > 2, offlining);
+ page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -780,8 +936,50 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- putback_lru_pages(from);
+ if (rc)
+ return rc;
+
+ return nr_failed + retry;
+}
+
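Note the contract change: migrate_pages() no longer calls putback_lru_pages() itself, so on a nonzero return pages may remain on the list and every caller must clean up; this is exactly the two-step pattern the mempolicy.c and memory_hotplug.c hunks above adopt. In kernel context (not a standalone program, with new_page_node and private as placeholders for the call site's allocator and its argument), the expected caller shape is:

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_page_node,
				    private, false, true);
		if (err)
			putback_lru_pages(&pagelist);
	}
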
+int migrate_huge_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int rc;
+
+ for (pass = 0; pass < 10 && retry; pass++) {
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, page, pass > 2, offlining,
+ sync);
+
+ switch(rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ retry++;
+ break;
+ case 0:
+ break;
+ default:
+ /* Permanent failure */
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ rc = 0;
+out:
if (rc)
return rc;
@@ -841,10 +1039,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
err = -EFAULT;
vma = find_vma(mm, pp->addr);
- if (!vma || !vma_migratable(vma))
+ if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET);
+ page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -890,9 +1088,12 @@ set_status:
}
err = 0;
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0);
+ (unsigned long)pm, 0, true);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
up_read(&mm->mmap_sem);
return err;
@@ -1005,7 +1206,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
int err = -EFAULT;
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || addr < vma->vm_start)
goto set_status;
page = follow_page(vma, addr, 0);
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+ vec += (next - addr) >> PAGE_SHIFT;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
mincore_unmapped_range(vma, addr, next, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
- struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
- int ret = 0;
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,33 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH | FOLL_GET;
- if (vma->vm_flags & VM_WRITE)
+ gup_flags = FOLL_TOUCH;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ if (vma->vm_flags & VM_LOCKED)
+ gup_flags |= FOLL_MLOCK;
+
/* We don't try to access the guard page of a stack vma */
if (stack_guard_page(vma, start)) {
addr += PAGE_SIZE;
nr_pages--;
}
- while (nr_pages > 0) {
- int i;
-
- cond_resched();
-
- /*
- * get_user_pages makes pages present if we are
- * setting mlock. and this extra reference count will
- * disable migration of this page. However, page may
- * still be truncated out from under us.
- */
- ret = __get_user_pages(current, mm, addr,
- min_t(int, nr_pages, ARRAY_SIZE(pages)),
- gup_flags, pages, NULL);
- /*
- * This can happen for, e.g., VM_NONLINEAR regions before
- * a page has been allocated and mapped at a given offset,
- * or for addresses that map beyond end of a file.
- * We'll mlock the pages if/when they get faulted in.
- */
- if (ret < 0)
- break;
-
- lru_add_drain(); /* push cached pages to LRU */
-
- for (i = 0; i < ret; i++) {
- struct page *page = pages[i];
-
- if (page->mapping) {
- /*
- * That preliminary check is mainly to avoid
- * the pointless overhead of lock_page on the
- * ZERO_PAGE: which might bounce very badly if
- * there is contention. However, we're still
- * dirtying its cacheline with get/put_page:
- * we'll add another __get_user_pages flag to
- * avoid it if that case turns out to matter.
- */
- lock_page(page);
- /*
- * Because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- mlock_vma_page(page);
- unlock_page(page);
- }
- put_page(page); /* ref from get_user_pages() */
- }
-
- addr += ret * PAGE_SIZE;
- nr_pages -= ret;
- ret = 0;
- }
-
- return ret; /* 0 or negative error code */
+ return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
}
/*
@@ -280,7 +239,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
- __mlock_vma_pages_range(vma, start, end);
+ __mlock_vma_pages_range(vma, start, end, NULL);
/* Hide errors from mmap() and other callers */
return 0;
@@ -372,18 +331,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
int ret = 0;
int lock = newflags & VM_LOCKED;
- if (newflags == vma->vm_flags ||
- (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
goto out; /* don't set VM_LOCKED, don't count */
- if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current)) {
- if (lock)
- make_pages_present(start, end);
- goto out; /* don't set VM_LOCKED, don't count */
- }
-
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +370,10 @@ success:
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- if (lock) {
+ if (lock)
vma->vm_flags = newflags;
- ret = __mlock_vma_pages_range(vma, start, end);
- if (ret < 0)
- ret = __mlock_posix_error_return(ret);
- } else {
+ else
munlock_vma_pages_range(vma, start, end);
- }
out:
*prev = vma;
@@ -439,7 +386,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;
- len = PAGE_ALIGN(len);
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
@@ -482,6 +430,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ int ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. __mlock_vma_pages_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ ret = __mlock_posix_error_return(ret);
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
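do_mlock_pages() above repeatedly intersects the wanted range [nstart, end) with the current VMA and lets __get_user_pages() temporarily drop mmap_sem (reported back through "locked"), retaking it and re-finding the VMA on the next pass. The interval arithmetic itself is easy to check in isolation; a standalone sketch:

	#include <assert.h>

	struct range { unsigned long start, end; };	/* [start, end) */

	/* Clamp the wanted range to one vma; returns 0 if they don't overlap. */
	static int clamp_to_vma(struct range *want, const struct range *vma)
	{
		unsigned long s = want->start > vma->start ? want->start : vma->start;
		unsigned long e = want->end   < vma->end   ? want->end   : vma->end;

		if (s >= e)
			return 0;
		want->start = s;
		want->end = e;
		return 1;
	}

	int main(void)
	{
		struct range want = { 0x1000, 0x9000 };
		const struct range vma = { 0x4000, 0x6000 };

		assert(clamp_to_vma(&want, &vma));
		assert(want.start == 0x4000 && want.end == 0x6000);
		return 0;
	}
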
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -507,6 +511,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
+ if (!error)
+ error = do_mlock_pages(start, len, 0);
return error;
}
@@ -571,6 +577,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
+ if (!ret && (flags & MCL_CURRENT)) {
+ /* Ignore errors */
+ do_mlock_pages(0, TASK_SIZE, 1);
+ }
out:
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,8 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/audit.h>
+#include <linux/khugepaged.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -252,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
- min_brk = mm->end_code;
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
@@ -587,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
/*
* When changing only vma->vm_end, we don't really need anon_vma
* lock. This is a fairly rare case by itself, but the anon_vma
@@ -814,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(prev);
return prev;
}
@@ -832,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(area);
return area;
}
@@ -1108,6 +1122,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long retval = -EBADF;
if (!(flags & MAP_ANONYMOUS)) {
+ audit_mmap_fd(fd, flags);
if (unlikely(flags & MAP_HUGETLB))
return -EINVAL;
file = fget(fd);
@@ -1759,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1806,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
@@ -2460,6 +2477,7 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
+ int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2477,16 +2495,23 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- if (unlikely(insert_vm_struct(mm, vma))) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (ret)
+ goto out;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
mm->total_vm += len >> PAGE_SHIFT;
perf_event_mmap(vma);
return 0;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->test_young) {
+ young = mn->ops->test_young(mn, mm, address);
+ if (young)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return young;
+}
+
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte - 1, ptl);
}
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot))
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+ change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
+ dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable);
} while (pud++, addr = next, addr != end);
}
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
}
@@ -211,6 +221,7 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ perf_event_mmap(vma);
return 0;
fail:
@@ -299,7 +310,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
- perf_event_mmap(vma);
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
if (!pmd)
return NULL;
- if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
return NULL;
return pmd;
@@ -91,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
*/
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
- if (new_vma->vm_truncate_count &&
- new_vma->vm_truncate_count != vma->vm_truncate_count)
- new_vma->vm_truncate_count = 0;
+ new_vma->vm_truncate_count = 0;
}
/*
@@ -101,7 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_sem prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_pte = pte_offset_map(new_pmd, new_addr);
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap_nested(new_pte - 1);
+ pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -147,7 +148,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
@@ -29,6 +29,7 @@
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/audit.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -126,7 +127,8 @@ unsigned int kobjsize(const void *objp)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *retry)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -184,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
@@ -293,12 +296,60 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return vzalloc(size);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -392,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
{
}
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
@@ -1411,6 +1487,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
struct file *file = NULL;
unsigned long retval = -EBADF;
+ audit_mmap_fd(fd, flags);
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
@@ -1668,6 +1745,7 @@ void exit_mmap(struct mm_struct *mm)
mm->mmap = vma->vm_next;
delete_vma_from_mm(vma);
delete_vma(mm, vma);
+ cond_resched();
}
kleave("");
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
return 0;
/*
- * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
- * need to be executed for something that cannot be killed.
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+ * so the entire heuristic doesn't need to be executed for something
+ * that cannot be killed.
*/
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ if (atomic_read(&p->mm->oom_disable_count)) {
task_unlock(p);
return 0;
}
@@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
+ struct task_struct *q;
+ struct mm_struct *mm;
+
p = find_lock_task_mm(p);
if (!p)
return 1;
+ /* mm cannot be safely dereferenced after task_unlock(p) */
+ mm = p->mm;
+
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm, K(p->mm->total_vm),
K(get_mm_counter(p->mm, MM_ANONPAGES)),
K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
+ /*
+ * Kill all processes sharing p->mm in other thread groups, if any.
+ * They don't get access to memory reserves or a higher scheduler
+ * priority, though, to avoid depletion of all memory or task
+ * starvation. This prevents mm->mmap_sem livelock when an oom killed
+ * task cannot exit because it requires the semaphore and it's contended
+ * by another thread trying to allocate memory itself. That thread will
+ * now get access to memory reserves since it has a pending fatal
+ * signal.
+ */
+ for_each_process(q)
+ if (q->mm == mm && !same_thread_group(q, p)) {
+ task_lock(q); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(q), q->comm);
+ task_unlock(q);
+ force_sig(SIGKILL, q);
+ }
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
@@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
- (current->signal->oom_adj != OOM_DISABLE)) {
+ current->mm && !atomic_read(&current->mm->oom_disable_count)) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,25 +404,22 @@ unsigned long determine_dirtyable_memory(void)
* - vm.dirty_background_ratio or vm.dirty_background_bytes
* - vm.dirty_ratio or vm.dirty_bytes
* The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * runtime tasks.
+ * real-time tasks.
*/
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = determine_dirtyable_memory();
+
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else {
- int dirty_ratio;
-
- dirty_ratio = vm_dirty_ratio;
- if (dirty_ratio < 5)
- dirty_ratio = 5;
- dirty = (dirty_ratio * available_memory) / 100;
- }
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
if (dirty_background_bytes)
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
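
global_dirty_limits() above now computes determine_dirtyable_memory() only when a ratio (rather than a byte count) is in use, and drops the old 5% floor on vm_dirty_ratio. Both styles of limit reduce to simple arithmetic; a standalone check (a 4096-byte page size is assumed):

	#include <assert.h>

	#define PAGE_SIZE 4096UL
	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned long available = 1000000;		/* dirtyable pages */
		unsigned long vm_dirty_bytes = 1UL << 30;	/* 1 GiB byte cap */

		/* byte-based limit: bytes converted to pages, rounded up */
		assert(DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) == 262144);
		/* ratio-based limit: a percentage of dirtyable memory */
		assert((20 * available) / 100 == 200000);
		return 0;
	}
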
@@ -510,7 +507,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <
+ if (nr_reclaimable + nr_writeback <=
(background_thresh + dirty_thresh) / 2)
break;
@@ -542,8 +539,8 @@ static void balance_dirty_pages(struct address_space *mapping,
* the last resort safeguard.
*/
dirty_exceeded =
- (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
- || (nr_reclaimable + nr_writeback >= dirty_thresh);
+ (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
+ || (nr_reclaimable + nr_writeback > dirty_thresh);
if (!dirty_exceeded)
break;
@@ -569,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping,
break; /* We've done our duty */
}
trace_wbc_balance_dirty_wait(&wbc, bdi);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
/*
@@ -1109,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
- SetPageDirty(page);
+ return !TestSetPageDirty(page);
return 0;
}
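
The one-line fix above matters because callers of a set_page_dirty method treat a nonzero return as "this call transitioned the page to dirty": TestSetPageDirty() returns the old bit value, so returning its negation lets exactly one of several racing callers claim the transition, which the old SetPageDirty()-then-return-0 version never did. A C11 model of the essential semantics (the PageDirty() fast path is omitted):

	#include <stdatomic.h>

	#define PG_dirty 0x1u

	/* Returns the previous state of the dirty bit, like TestSetPageDirty(). */
	static int test_and_set_dirty(atomic_uint *flags)
	{
		return atomic_fetch_or(flags, PG_dirty) & PG_dirty;
	}

	/* Exactly one concurrent caller gets 1 back, matching the fixed helper. */
	static int set_dirty_no_writeback(atomic_uint *flags)
	{
		return !test_and_set_dirty(flags);
	}
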
@@ -1121,6 +1118,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1127,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for set_page_writeback family.
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic
+ * wrt interrupts.
+ */
+void account_page_writeback(struct page *page)
+{
+ inc_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+}
+EXPORT_SYMBOL(account_page_writeback);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -1366,7 +1376,7 @@ int test_set_page_writeback(struct page *page)
ret = TestSetPageWriteback(page);
}
if (!ret)
- inc_zone_page_state(page, NR_WRITEBACK);
+ account_page_writeback(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..a873e61e312e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
@@ -103,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
* only be modified with pm_mutex held, unless the suspend/hibernate code is
* guaranteed not to run in parallel with that modification).
*/
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask = mask;
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
}
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
{
- gfp_t ret = gfp_allowed_mask;
-
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask &= ~mask;
- return ret;
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */
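
Note: the old interface made the caller carry the saved mask around; the new pair keeps it in one static variable so suspend code simply brackets its critical region. A user-space sketch of the same save/restore pattern (mask values invented for illustration):

#include <stdio.h>

#define GFP_IOFS   0x3u		/* stand-in for __GFP_IO | __GFP_FS */
#define GFP_KERNEL 0x7u

static unsigned int gfp_allowed_mask = GFP_KERNEL;
static unsigned int saved_gfp_mask;

/* forbid I/O and FS allocations around suspend, remembering the old mask */
static void restrict_gfp_mask(void)
{
	if (saved_gfp_mask)	/* mirrors the WARN_ON: restricting twice is a bug */
		fprintf(stderr, "mask already saved\n");
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS;
}

static void restore_gfp_mask(void)
{
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

int main(void)
{
	restrict_gfp_mask();
	printf("restricted: %#x\n", gfp_allowed_mask);	/* 0x4 */
	restore_gfp_mask();
	restore_gfp_mask();	/* a second restore is harmless */
	printf("restored: %#x\n", gfp_allowed_mask);	/* 0x7 */
	return 0;
}
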
@@ -351,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -420,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -442,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount to -2.
+ * Setting, clearing, and testing _mapcount == -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -476,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length (1 << order) and marked with _mapcount == -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -493,6 +492,7 @@ static inline void __free_one_page(struct page *page,
{
unsigned long page_idx;
unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
if (unlikely(PageCompound(page)))
@@ -507,7 +507,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -515,7 +516,7 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -530,11 +531,12 @@ static inline void __free_one_page(struct page *page,
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
- combined_idx = __find_combined_index(page_idx, order);
- higher_page = page + combined_idx - page_idx;
- higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
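
Note: both helpers are pure index arithmetic: a block's buddy is found by flipping bit `order` of its index (XOR), and the merged double-size block starts at the two indices' common prefix (AND), which is why __find_combined_index could be folded away. A runnable sketch of that arithmetic with bare integers:

#include <stdio.h>

/* buddy of the block starting at page_idx, at the given order */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* start of the merged (order + 1) block containing both buddies */
static unsigned long combined_index(unsigned long page_idx, unsigned long buddy_idx)
{
	return buddy_idx & page_idx;
}

int main(void)
{
	unsigned long page_idx = 12;	/* an order-2 block: pages 12..15 */
	unsigned int order = 2;
	unsigned long buddy_idx = find_buddy_index(page_idx, order);

	/* prints: buddy of 12 at order 2 is 8; merged block starts at 8 */
	printf("buddy of %lu at order %u is %lu; merged block starts at %lu\n",
	       page_idx, order, buddy_idx,
	       combined_index(page_idx, buddy_idx));
	return 0;
}
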
@@ -645,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0; i < (1 << order); i++) {
- struct page *pg = page + i;
-
- if (PageAnon(pg))
- pg->mapping = NULL;
- bad += free_pages_check(pg);
- }
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page + i);
if (bad)
return false;
@@ -1089,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ pcp->count = 0;
+ }
local_irq_restore(flags);
}
}
@@ -1454,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
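
Note: the refactoring lets both entry points share one check while only the _safe variant pays for a drift-corrected counter snapshot. A simplified user-space model of the shared check, without lowmem_reserve or the ALLOC_* adjustments (zone contents assumed):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* simplified: no lowmem_reserve[] and no ALLOC_HIGH/ALLOC_HARDER scaling */
static bool watermark_ok(long free_pages, unsigned int order,
			 long min, const long nr_free[MAX_ORDER])
{
	free_pages -= (1L << order) - 1;
	if (free_pages <= min)
		return false;
	for (unsigned int o = 0; o < order; o++) {
		/* at order o+1, pages held in order-o blocks are unusable */
		free_pages -= nr_free[o] << o;
		min >>= 1;	/* demand less headroom at higher orders */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	long nr_free[MAX_ORDER] = { 512, 128, 32, 8 };	/* blocks per order */
	long free = 512 * 1 + 128 * 2 + 32 * 4 + 8 * 8;	/* 960 free pages */

	printf("order-0, min=100: %d\n", watermark_ok(free, 0, 100, nr_free));	/* 1 */
	printf("order-3, min=800: %d\n", watermark_ok(free, 3, 800, nr_free));	/* 0 */
	return 0;
}
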
#ifdef CONFIG_NUMA
@@ -1787,15 +1807,18 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
struct page *page;
if (!order || compaction_deferred(preferred_zone))
return NULL;
+ current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask);
+ nodemask, sync_migration);
+ current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
@@ -1831,7 +1854,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
return NULL;
}
@@ -1846,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
@@ -1906,7 +1929,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
@@ -1914,24 +1937,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
- BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
@@ -1939,21 +1962,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags |= (gfp_mask & __GFP_HIGH);
+ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1972,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -1997,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2006,6 +2036,14 @@ restart:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+ first_zones_zonelist(zonelist, high_zoneidx, NULL,
+ &preferred_zone);
+
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2028,21 +2066,26 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /* Try direct compaction */
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ migratetype, &did_some_progress,
+ sync_migration);
if (page)
goto got_pg;
+ sync_migration = true;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2094,15 +2137,29 @@ rebalance:
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after direct
+ * reclaim. Reclaim/compaction depends on compaction being called
+ * after reclaim, so call it directly here if necessary.
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
show_mem();
}
@@ -2145,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
get_mems_allowed();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+ first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
if (!preferred_zone) {
put_mems_allowed();
return NULL;
@@ -2436,7 +2495,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -2579,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -3007,14 +3073,6 @@ static __init_refok int __build_all_zonelists(void *data)
build_zonelist_cache(pgdat);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
- /* Setup real pagesets for the new zone */
- if (data) {
- struct zone *zone = data;
- setup_zone_pageset(zone);
- }
-#endif
-
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
@@ -3063,7 +3121,11 @@ void build_all_zonelists(void *data)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ if (data)
+ setup_zone_pageset((struct zone *)data);
+#endif
+ stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -3636,6 +3698,41 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ int i;
+
+ /* Need to go over early_node_map to find a good range for the node */
+ for_each_active_range_index_in_nid(i, nid) {
+ u64 addr;
+ u64 ei_start, ei_last;
+ u64 final_start, final_end;
+
+ ei_last = early_node_map[i].end_pfn;
+ ei_last <<= PAGE_SHIFT;
+ ei_start = early_node_map[i].start_pfn;
+ ei_start <<= PAGE_SHIFT;
+
+ final_start = max(ei_start, goal);
+ final_end = min(ei_last, limit);
+
+ if (final_start >= final_end)
+ continue;
+
+ addr = memblock_find_in_range(final_start, final_end, size, align);
+
+ if (addr == MEMBLOCK_ERROR)
+ continue;
+
+ return addr;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+#endif
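
Note: the helper clamps each early node range to the caller's [goal, limit) window and takes the first range memblock can satisfy. A user-space sketch of that clamping walk (ranges, size and window are made-up values):

#include <stdint.h>
#include <stdio.h>

#define ALLOC_ERROR (~0ULL)

struct range { uint64_t start, end; };

/* first range that can hold `size` bytes inside [goal, limit) */
static uint64_t find_in_ranges(const struct range *r, int nr,
			       uint64_t size, uint64_t goal, uint64_t limit)
{
	for (int i = 0; i < nr; i++) {
		uint64_t start = r[i].start > goal ? r[i].start : goal;
		uint64_t end = r[i].end < limit ? r[i].end : limit;

		if (start >= end || end - start < size)
			continue;
		return start;	/* memblock would search inside [start, end) */
	}
	return ALLOC_ERROR;
}

int main(void)
{
	struct range node_ranges[] = {
		{ 0x1000, 0x4000 },	/* entirely below the goal, skipped */
		{ 0x10000, 0x40000 },
	};
	uint64_t addr = find_in_ranges(node_ranges, 2, 0x2000,
				       0x8000, 0x100000);

	printf("allocated at %#llx\n", (unsigned long long)addr);	/* 0x10000 */
	return 0;
}
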
+
int __init add_from_early_node_map(struct range *range, int az,
int nr_range, int nid)
{
@@ -3655,46 +3752,26 @@ int __init add_from_early_node_map(struct range *range, int az,
void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
- int i;
void *ptr;
+ u64 addr;
- if (limit > get_max_mapped())
- limit = get_max_mapped();
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
- /* need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
- u64 addr;
- u64 ei_start, ei_last;
+ addr = find_memory_core_early(nid, size, align, goal, limit);
- ei_last = early_node_map[i].end_pfn;
- ei_last <<= PAGE_SHIFT;
- ei_start = early_node_map[i].start_pfn;
- ei_start <<= PAGE_SHIFT;
- addr = find_early_area(ei_start, ei_last,
- goal, limit, size, align);
-
- if (addr == -1ULL)
- continue;
-
-#if 0
- printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
- nid,
- ei_start, ei_last, goal, limit, size,
- align, addr);
-#endif
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- reserve_early_without_check(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
- }
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
- return NULL;
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
}
#endif
@@ -3997,7 +4074,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
}
#else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
struct zone *zone, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
@@ -5281,12 +5358,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 * page allocator never allocates memory from ISOLATE blocks.
*/
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+ unsigned long pfn, iter, found;
+ /*
+ * To avoid noisy data, lru_add_drain_all() should be called first.
+ * If the zone is ZONE_MOVABLE, it never contains immobile pages.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return true;
+
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ return true;
+
+ pfn = page_to_pfn(page);
+ for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ unsigned long check = pfn + iter;
+
+ if (!pfn_valid_within(check)) {
+ iter++;
+ continue;
+ }
+ page = pfn_to_page(check);
+ if (!page_count(page)) {
+ if (PageBuddy(page))
+ iter += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (!PageLRU(page))
+ found++;
+ /*
+ * If there are RECLAIMABLE pages, we need to check them.
+ * But for now, memory offline itself doesn't call shrink_slab()
+ * and this still needs to be fixed.
+ */
+ /*
+ * If the page is not RAM, page_count() should be 0.
+ * We don't need any further checks: this is a _used_, non-movable page.
+ *
+ * The problematic thing here is PG_reserved pages. PG_reserved
+ * is set on both memory hole pages and _used_ kernel pages at
+ * boot.
+ */
+ if (found > count)
+ return false;
+ }
+ return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ return __count_immobile_pages(zone, page, 0);
+}
+
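
Note: in other words, a pageblock counts as removable when, skipping free buddy runs, at most `count` of its used pages are off the LRU. A simplified sketch of that scan, with page state reduced to a three-value enum (the kernel additionally skips whole buddy chunks by their order):

#include <stdbool.h>
#include <stdio.h>

enum fake_page { PAGE_FREE, PAGE_LRU, PAGE_PINNED };

/* true if at most `count` pages in the block are used and not on the LRU */
static bool block_is_removable(const enum fake_page *pages, int nr, int count)
{
	int found = 0;

	for (int i = 0; i < nr; i++) {
		if (pages[i] == PAGE_FREE)
			continue;	/* free pages never block removal */
		if (pages[i] != PAGE_LRU)
			found++;	/* used and unmovable */
		if (found > count)
			return false;
	}
	return true;
}

int main(void)
{
	enum fake_page block[8] = {
		PAGE_FREE, PAGE_LRU, PAGE_LRU, PAGE_FREE,
		PAGE_PINNED, PAGE_FREE, PAGE_LRU, PAGE_FREE,
	};

	printf("removable, count=0: %d\n", block_is_removable(block, 8, 0));	/* 0 */
	printf("removable, count=1: %d\n", block_is_removable(block, 8, 1));	/* 1 */
	return 0;
}
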
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- struct page *curr_page;
- unsigned long flags, pfn, iter;
- unsigned long immobile = 0;
+ unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
@@ -5296,11 +5426,6 @@ int set_migratetype_isolate(struct page *page)
zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
- zone_idx == ZONE_MOVABLE) {
- ret = 0;
- goto out;
- }
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
@@ -5320,23 +5445,20 @@ int set_migratetype_isolate(struct page *page)
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret || !arg.pages_found)
+ if (notifier_ret)
goto out;
-
- for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- curr_page = pfn_to_page(iter);
- if (!page_count(curr_page) || PageLRU(curr_page))
- continue;
-
- immobile++;
- }
-
- if (arg.pages_found == immobile)
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (__count_immobile_pages(zone, page, arg.pages_found))
ret = 0;
+ /*
+ * immobile means "not-on-lru" paes. If immobile is larger than
+ * removable-by-driver pages reported by notifier, we'll fail.
+ */
+
out:
if (!ret) {
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
@@ -5455,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = {
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
- {1UL << PG_buddy, "buddy" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
@@ -5503,7 +5624,7 @@ void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_count(page), page_mapcount(page),
+ page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 0 if all pages in the range is isolated.
+ * Returns 1 if all pages in the range are isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
struct zone *zone;
int ret;
- pfn = start_pfn;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
* are not aligned to pageblock_nr_pages.
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee5..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(walk->mm, pmd);
if (pmd_none_or_clear_bad(pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
@@ -139,7 +140,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
int err = 0;
- struct vm_area_struct *vma;
if (addr >= end)
return err;
@@ -149,15 +149,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
+ struct vm_area_struct *uninitialized_var(vma);
+
next = pgd_addr_end(addr, end);
+#ifdef CONFIG_HUGETLB_PAGE
/*
* handle hugetlb vma individually because pagetable walk for
* the hugetlb page is dependent on the architecture and
* we can't handled it in the same manner as non-huge pages.
*/
vma = find_vma(walk->mm, addr);
-#ifdef CONFIG_HUGETLB_PAGE
if (vma && is_vm_hugetlb_page(vma)) {
if (vma->vm_end < next)
next = vma->vm_end;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540a..89633fefc6a2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
* chunk size is not aligned. percpu-km code will whine about it.
*/
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#error "contiguous percpu allocation is incompatible with paged first chunk"
#endif
@@ -35,7 +35,11 @@
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- /* noop */
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
return 0;
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
return NULL;
vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+ pcpu_nr_groups, pcpu_atom_size);
if (!vms) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index c76ef3891e0d..3f930018aa60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
* as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be eqaul to or larger than the maximum contiguous
+ * guaranteed to be equal to or larger than the maximum contiguous
* area in the chunk. This helps the allocator not to iterate the
* chunk maps unnecessarily.
*
@@ -76,6 +76,7 @@
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
(unsigned long)pcpu_base_addr - \
(unsigned long)__per_cpu_start)
#endif
+#else /* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
+#endif /* CONFIG_SMP */
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
@@ -252,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
/*
* (Un)populated page region iterators. Iterate over (un)populated
- * page regions betwen @start and @end in @chunk. @rs and @re should
+ * page regions between @start and @end in @chunk. @rs and @re should
* be integer variables and will be set to start and end page index of
* the current region.
*/
@@ -287,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- else {
- void *ptr = vmalloc(size);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
- }
+ else
+ return vzalloc(size);
}
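
Note: vzalloc() folds the vmalloc-plus-memset idiom into a single call. The same shape of cleanup in plain libc terms, as an analogy only (vmalloc has no user-space equivalent):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* before: allocate, then zero by hand */
static void *alloc_zeroed_open_coded(size_t size)
{
	void *ptr = malloc(size);

	if (ptr)
		memset(ptr, 0, size);
	return ptr;
}

/* after: one call that both allocates and zeroes, like vzalloc() */
static void *alloc_zeroed(size_t size)
{
	return calloc(1, size);
}

int main(void)
{
	char *a = alloc_zeroed_open_coded(64);
	char *b = alloc_zeroed(64);

	printf("%d %d\n", a && a[0] == 0, b && b[0] == 0);	/* 1 1 */
	free(a);
	free(b);
	return 0;
}
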
/**
@@ -820,8 +822,8 @@ fail_unlock_mutex:
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.
+ * Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -840,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align from reserved
- * percpu area if arch has set it up; otherwise, allocation is served
- * from the same dynamic area. Might sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area. Might sleep.
+ * Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -949,6 +952,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
*/
bool is_kernel_percpu_address(unsigned long addr)
{
+#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
unsigned int cpu;
@@ -959,6 +963,8 @@ bool is_kernel_percpu_address(unsigned long addr)
if ((void *)addr >= start && (void *)addr < start + static_size)
return true;
}
+#endif
+ /* on UP, can't distinguish from other static vars, always false */
return false;
}
@@ -1067,161 +1073,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
}
/**
- * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: minimum free size for dynamic allocation in bytes
- * @atom_size: allocation atom size
- * @cpu_distance_fn: callback to determine distance between cpus, optional
- *
- * This function determines grouping of units, their mappings to cpus
- * and other parameters considering needed percpu size, allocation
- * atom size and distances between CPUs.
- *
- * Groups are always mutliples of atom size and CPUs which are of
- * LOCAL_DISTANCE both ways are grouped together and share space for
- * units in the same group. The returned configuration is guaranteed
- * to have CPUs on different nodes on different groups and >=75% usage
- * of allocated virtual address space.
- *
- * RETURNS:
- * On success, pointer to the new allocation_info is returned. On
- * failure, ERR_PTR value is returned.
- */
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
- size_t reserved_size, size_t dyn_size,
- size_t atom_size,
- pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-{
- static int group_map[NR_CPUS] __initdata;
- static int group_cnt[NR_CPUS] __initdata;
- const size_t static_size = __per_cpu_end - __per_cpu_start;
- int nr_groups = 1, nr_units = 0;
- size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
- int last_allocs, group, unit;
- unsigned int cpu, tcpu;
- struct pcpu_alloc_info *ai;
- unsigned int *cpu_map;
-
- /* this function may be called multiple times */
- memset(group_map, 0, sizeof(group_map));
- memset(group_cnt, 0, sizeof(group_cnt));
-
- /* calculate size_sum and ensure dyn_size is enough for early alloc */
- size_sum = PFN_ALIGN(static_size + reserved_size +
- max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
- dyn_size = size_sum - static_size - reserved_size;
-
- /*
- * Determine min_unit_size, alloc_size and max_upa such that
- * alloc_size is multiple of atom_size and is the smallest
- * which can accomodate 4k aligned segments which are equal to
- * or larger than min_unit_size.
- */
- min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-
- alloc_size = roundup(min_unit_size, atom_size);
- upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- upa--;
- max_upa = upa;
-
- /* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
- group_map[cpu] = group;
- group_cnt[group]++;
- }
-
- /*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
- */
- last_allocs = INT_MAX;
- for (upa = max_upa; upa; upa--) {
- int allocs = 0, wasted = 0;
-
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- continue;
-
- for (group = 0; group < nr_groups; group++) {
- int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
- allocs += this_allocs;
- wasted += this_allocs * upa - group_cnt[group];
- }
-
- /*
- * Don't accept if wastage is over 1/3. The
- * greater-than comparison ensures upa==1 always
- * passes the following check.
- */
- if (wasted > num_possible_cpus() / 3)
- continue;
-
- /* and then don't consume more memory */
- if (allocs > last_allocs)
- break;
- last_allocs = allocs;
- best_upa = upa;
- }
- upa = best_upa;
-
- /* allocate and fill alloc_info */
- for (group = 0; group < nr_groups; group++)
- nr_units += roundup(group_cnt[group], upa);
-
- ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
- if (!ai)
- return ERR_PTR(-ENOMEM);
- cpu_map = ai->groups[0].cpu_map;
-
- for (group = 0; group < nr_groups; group++) {
- ai->groups[group].cpu_map = cpu_map;
- cpu_map += roundup(group_cnt[group], upa);
- }
-
- ai->static_size = static_size;
- ai->reserved_size = reserved_size;
- ai->dyn_size = dyn_size;
- ai->unit_size = alloc_size / upa;
- ai->atom_size = atom_size;
- ai->alloc_size = alloc_size;
-
- for (group = 0, unit = 0; group_cnt[group]; group++) {
- struct pcpu_group_info *gi = &ai->groups[group];
-
- /*
- * Initialize base_offset as if all groups are located
- * back-to-back. The caller should update this to
- * reflect actual allocation.
- */
- gi->base_offset = unit * ai->unit_size;
-
- for_each_possible_cpu(cpu)
- if (group_map[cpu] == group)
- gi->cpu_map[gi->nr_units++] = cpu;
- gi->nr_units = roundup(gi->nr_units, upa);
- unit += gi->nr_units;
- }
- BUG_ON(unit != nr_units);
-
- return ai;
-}
-
-/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
@@ -1363,7 +1214,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* sanity checks */
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
+#endif
PCPU_SETUP_BUG_ON(!base_addr);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
@@ -1411,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
- pcpu_dump_alloc_info(KERN_INFO, ai);
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
@@ -1488,6 +1341,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
return 0;
}
+#ifdef CONFIG_SMP
+
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
@@ -1515,8 +1370,180 @@ static int __init percpu_alloc_setup(char *str)
}
early_param("percpu_alloc", percpu_alloc_setup);
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always multiples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, size_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ int nr_groups = 1, nr_units = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs, group, unit;
+ unsigned int cpu, tcpu;
+ struct pcpu_alloc_info *ai;
+ unsigned int *cpu_map;
+
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
+
+ /* calculate size_sum and ensure dyn_size is enough for early alloc */
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+ dyn_size = size_sum - static_size - reserved_size;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of atom_size and is the smallest
+ * which can accommodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, atom_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group && cpu_distance_fn &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ nr_groups = max(nr_groups, group + 1);
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group < nr_groups; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 1/3. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ upa = best_upa;
+
+ /* allocate and fill alloc_info */
+ for (group = 0; group < nr_groups; group++)
+ nr_units += roundup(group_cnt[group], upa);
+
+ ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+ if (!ai)
+ return ERR_PTR(-ENOMEM);
+ cpu_map = ai->groups[0].cpu_map;
+
+ for (group = 0; group < nr_groups; group++) {
+ ai->groups[group].cpu_map = cpu_map;
+ cpu_map += roundup(group_cnt[group], upa);
+ }
+
+ ai->static_size = static_size;
+ ai->reserved_size = reserved_size;
+ ai->dyn_size = dyn_size;
+ ai->unit_size = alloc_size / upa;
+ ai->atom_size = atom_size;
+ ai->alloc_size = alloc_size;
+
+ for (group = 0, unit = 0; group_cnt[group]; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+
+ /*
+ * Initialize base_offset as if all groups are located
+ * back-to-back. The caller should update this to
+ * reflect actual allocation.
+ */
+ gi->base_offset = unit * ai->unit_size;
+
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ gi->cpu_map[gi->nr_units++] = cpu;
+ gi->nr_units = roundup(gi->nr_units, upa);
+ unit += gi->nr_units;
+ }
+ BUG_ON(unit != nr_units);
+
+ return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
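
Note: the units-per-alloc search above starts from the largest candidate and backs off until the unit count divides alloc_size into page-aligned pieces. A runnable sketch of just that max_upa computation (the 40k unit and 2MB atom are assumed inputs):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long roundup_to(unsigned long x, unsigned long to)
{
	return (x + to - 1) / to * to;
}

int main(void)
{
	unsigned long min_unit_size = 40 * 1024;	/* assumed per-cpu need */
	unsigned long atom_size = 2 * 1024 * 1024;	/* 2MB huge-page atom */
	unsigned long alloc_size = roundup_to(min_unit_size, atom_size);
	unsigned long upa = alloc_size / min_unit_size;

	/* back off until upa divides alloc_size into page-aligned units */
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;

	/* 2MB atom, 40k units: max_upa = 32, unit_size = 64k */
	printf("max_upa=%lu unit_size=%lu\n", upa, alloc_size / upa);
	return 0;
}
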
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
@@ -1645,10 +1672,9 @@ out_free:
free_bootmem(__pa(areas), areas_size);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
- !CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* BUILD_EMBED_FIRST_CHUNK */
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#ifdef BUILD_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
@@ -1756,10 +1782,11 @@ out_free_ar:
pcpu_free_alloc_info(ai);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
- * Generic percpu area setup.
+ * Generic SMP percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
@@ -1770,7 +1797,6 @@ out_free_ar:
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
@@ -1799,13 +1825,48 @@ void __init setup_per_cpu_areas(void)
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
- panic("Failed to initialized percpu areas.");
+ panic("Failed to initialize percpu areas.");
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else /* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * UP always uses km-based percpu allocator with identity mapping.
+ * Static percpu variables are indistinguishable from the usual static
+ * variables and don't require any special preparation.
+ */
+void __init setup_per_cpu_areas(void)
+{
+ const size_t unit_size =
+ roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+ PERCPU_DYNAMIC_RESERVE));
+ struct pcpu_alloc_info *ai;
+ void *fc;
+
+ ai = pcpu_alloc_alloc_info(1, 1);
+ fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!ai || !fc)
+ panic("Failed to allocate memory for percpu areas.");
+
+ ai->dyn_size = unit_size;
+ ai->unit_size = unit_size;
+ ai->atom_size = unit_size;
+ ai->alloc_size = unit_size;
+ ai->groups[0].nr_units = 1;
+ ai->groups[0].cpu_map[0] = 0;
+
+ if (pcpu_setup_first_chunk(ai, fc) < 0)
+ panic("Failed to initialize percpu areas.");
+}
+
+#endif /* CONFIG_SMP */
/*
* First and reserved chunks are initialized with temporary allocation
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index db884fae5721..000000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
- */
-
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-
-void __percpu *__alloc_percpu(size_t size, size_t align)
-{
- /*
- * Can't easily make larger alignment work with kmalloc. WARN
- * on it. Larger alignment should only be used for module
- * percpu sections on SMP for which this path isn't used.
- */
- WARN_ON_ONCE(align > SMP_CACHE_BYTES);
- return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-void free_percpu(void __percpu *p)
-{
- kfree(this_cpu_ptr(p));
-}
-EXPORT_SYMBOL_GPL(free_percpu);
-
-phys_addr_t per_cpu_ptr_to_phys(void *addr)
-{
- return __pa(addr);
-}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..eb663fb533e0
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,121 @@
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <linux/pagemap.h>
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed, and
+ * writable). Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache. This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c, so we changed this macro slightly.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+ if (changed) {
+ set_pte_at(vma->vm_mm, address, ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int changed = !pmd_same(*pmdp, entry);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (changed) {
+ set_pmd_at(vma->vm_mm, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+ return changed;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+ BUG();
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = pmd_mksplitting(*pmdp);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ /* tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
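
Note: every helper in this new file is guarded the same way: the generic version compiles only when the architecture has not claimed the operation with a __HAVE_ARCH_* define. A tiny standalone sketch of that opt-out convention (names invented for illustration):

#include <stdio.h>

/* an arch header would do: #define __HAVE_ARCH_WIDGET_FLUSH */

#ifndef __HAVE_ARCH_WIDGET_FLUSH
/* generic fallback, used by every arch that stays silent */
static void widget_flush(void)
{
	printf("generic widget_flush\n");
}
#else
/* the arch supplies its own widget_flush() elsewhere */
void widget_flush(void);
#endif

int main(void)
{
	widget_flush();	/* prints the generic variant here */
	return 0;
}
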
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
}
-void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
@@ -94,7 +94,7 @@ void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, but if
- * if not we either need to find an adjacent mapping that we
+ * not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
* allocate a new one.
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
list_add(&avc->same_vma, &vma->anon_vma_chain);
anon_vma_lock(anon_vma);
+ /*
+ * It's critical to add new vmas to the tail of the anon_vma,
+ * see comment in huge_memory.c:__split_huge_page().
+ */
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
anon_vma_unlock(anon_vma);
}
@@ -314,7 +318,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against races.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *__page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma, *root_anon_vma;
unsigned long anon_mapping;
@@ -348,6 +352,8 @@ out:
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ __releases(&anon_vma->root->lock)
+ __releases(RCU)
{
anon_vma_unlock(anon_vma);
rcu_read_unlock();
@@ -358,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
* Returns virtual address or -EFAULT if page's index/offset is not
* within the range mapped the @vma.
*/
-static inline unsigned long
+inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -407,7 +413,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
*
* On success returns with pte mapped and locked.
*/
-pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
@@ -433,6 +439,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
+ if (pmd_trans_huge(*pmd))
+ return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
@@ -487,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
- spinlock_t *ptl;
int referenced = 0;
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- goto out;
-
/*
* Don't want to elevate referenced for mlocked page that gets this far,
* in order that it progresses to try_to_unmap and is moved to the
* unevictable list.
*/
if (vma->vm_flags & VM_LOCKED) {
- *mapcount = 1; /* break early from loop */
+ *mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
- goto out_unmap;
- }
-
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!VM_SequentialReadHint(vma)))
- referenced++;
+ goto out;
}
/* Pretend the page is referenced if the task has the
@@ -524,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
rwsem_is_locked(&mm->mmap_sem))
referenced++;
-out_unmap:
+ if (unlikely(PageTransHuge(page))) {
+ pmd_t *pmd;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (pmd && !pmd_trans_splitting(*pmd) &&
+ pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ spin_unlock(&mm->page_table_lock);
+ } else {
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!VM_SequentialReadHint(vma)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
+ }
+
(*mapcount)--;
- pte_unmap_unlock(pte, ptl);
if (referenced)
*vm_flags |= vma->vm_flags;
@@ -745,7 +765,7 @@ int page_mkclean(struct page *page)
if (mapping) {
ret = page_mkclean_file(mapping, page);
if (page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
ret = 1;
}
}
@@ -780,10 +800,10 @@ void page_move_anon_rmap(struct page *page,
}
/**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page: the page to add the mapping to
- * @vma: the vm area in which the mapping is added
- * @address: the user virtual address mapped
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page: Page to add to rmap
+ * @vma: VM area to add page to.
+ * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +813,16 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
+ if (PageAnon(page))
+ return;
+
/*
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
*/
- if (!exclusive) {
- if (PageAnon(page))
- return;
+ if (!exclusive)
anon_vma = anon_vma->root;
- } else {
- /*
- * In this case, swapped-out-but-not-discarded swap-cache
- * is remapped. So, no need to update page->mapping here.
- * We convice anon_vma poitned by page->mapping is not obsolete
- * because vma->anon_vma is necessary to be a family of it.
- */
- if (PageAnon(page))
- return;
- }
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -871,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
int first = atomic_inc_and_test(&page->_mapcount);
- if (first)
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (first) {
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ }
if (unlikely(PageKsm(page)))
return;
@@ -900,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -918,7 +937,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
}
@@ -942,7 +961,7 @@ void page_remove_rmap(struct page *page)
* containing the swap entry, but page not yet written to swap.
*/
if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
set_page_dirty(page);
}
/*
@@ -953,10 +972,14 @@ void page_remove_rmap(struct page *page)
return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
- __dec_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __dec_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -1209,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1407,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int ret;
BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
if (unlikely(PageKsm(page)))
ret = try_to_unmap_ksm(page, flags);
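try_to_unmap() walks individual ptes, so it must never be handed a transparent huge page; hugetlbfs pages (PageHuge()) are the legitimate exception since they are unmapped as a unit. A hypothetical caller pattern that satisfies the new assertion, assuming split_huge_page() from the same series:

	if (PageTransHuge(page) && !PageHuge(page)) {
		/* split the THP into normal pages first */
		if (unlikely(split_huge_page(page)))
			return SWAP_FAIL;	/* could not split, give up */
	}
	ret = try_to_unmap(page, TTU_UNMAP);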
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
- atomic_inc(&inode->i_count); /* New dentry reference */
+ ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
@@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
if (*len < 3)
return 255;
- if (hlist_unhashed(&inode->i_hash)) {
+ if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
* so as we hash inodes here rather than at creation
* time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
*/
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
__insert_inode_hash(inode,
inode->i_ino + inode->i_generation);
spin_unlock(&lock);
@@ -2414,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &p->vfs_inode;
}
+static void shmem_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
static void shmem_destroy_inode(struct inode *inode)
{
if ((inode->i_mode & S_IFMT) == S_IFREG) {
/* only struct inode is valid if it's an inline symlink */
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}
- kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+ call_rcu(&inode->i_rcu, shmem_i_callback);
}
static void init_once(void *foo)
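With the dcache moving to RCU-walk path lookup, an inode can still be dereferenced under rcu_read_lock() after its last reference is dropped, so filesystems free their inodes from an RCU callback instead of synchronously. A minimal sketch of the pattern shmem adopts here, written for a hypothetical foofs:

static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlays i_dentry */
	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* defer the actual free until a grace period has elapsed */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}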
@@ -2537,16 +2545,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
};
-static int shmem_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
static struct file_system_type tmpfs_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .get_sb = shmem_get_sb,
+ .mount = shmem_mount,
.kill_sb = kill_litter_super,
};
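This is part of the tree-wide conversion from the .get_sb superblock interface to the dentry-returning .mount one; the get_sb_nodev()-style helpers become mount_nodev() and friends. For a hypothetical single-instance filesystem the conversion looks like:

static struct dentry *myfs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	/* mount_nodev() hands back the root dentry (or an ERR_PTR)
	 * directly, instead of filling in a vfsmount as get_sb_nodev()
	 * used to */
	return mount_nodev(fs_type, flags, data, myfs_fill_super);
}

static struct file_system_type myfs_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "myfs",
	.mount   = myfs_mount,
	.kill_sb = kill_litter_super,
};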
@@ -2642,7 +2650,7 @@ out:
static struct file_system_type tmpfs_fs_type = {
.name = "tmpfs",
- .get_sb = ramfs_get_sb,
+ .mount = ramfs_mount,
.kill_sb = kill_litter_super,
};
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..37961d1f584f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -284,7 +284,7 @@ struct kmem_list3 {
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC MAX_NUMNODES
#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
static void next_reap_node(void)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(slab_reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
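__get_cpu_var() computes the per-cpu address from smp_processor_id() and then dereferences it; the this_cpu operations let the architecture fold both steps together (a single segment-prefixed instruction on x86). The before/after pattern, sketched:

	int node;

	/* before: two per-cpu address computations plus loads/stores */
	node = __get_cpu_var(slab_reap_node);
	__get_cpu_var(slab_reap_node) = node;

	/* after: read and write through the per-cpu accessors */
	node = __this_cpu_read(slab_reap_node);
	__this_cpu_write(slab_reap_node, node);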
@@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
struct array_cache *from, unsigned int max)
{
/* Figure out how many entries to transfer */
- int nr = min(min(from->avail, max), to->limit - to->avail);
+ int nr = min3(from->avail, max, to->limit - to->avail);
if (!nr)
return 0;
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+ void *ret;
-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
- if (unlikely(!kern_ptr_validate(ptr, size)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ trace_kmalloc(_RET_IP_, ret,
+ size, slab_buffer_size(cachep), flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid)
+void *kmem_cache_alloc_node_trace(size_t size,
+ struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
+ void *ret;
+
+ ret = __cache_alloc_node(cachep, flags, nodeid,
__builtin_return_address(0));
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, slab_buffer_size(cachep),
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
- void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
- trace_kmalloc_node((unsigned long) caller, ret,
- size, cachep->buffer_size, flags, node);
-
- return ret;
+ return kmem_cache_alloc_node_trace(size, cachep, flags, node);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
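Renaming kmem_cache_alloc_notrace() to kmem_cache_alloc_trace() inverts the scheme: the out-of-line function now contains the tracepoint itself, and it takes the caller's requested size so the event records both requested and allocated bytes. A hedged sketch of the resulting call pattern (this is exactly what __do_kmalloc_node() above reduces to):

	struct kmem_cache *cachep;
	void *p;

	cachep = kmem_find_general_cachep(64, GFP_KERNEL);
	p = kmem_cache_alloc_trace(64, cachep, GFP_KERNEL);
	/* trace_kmalloc() reports bytes_req=64 alongside the cache's
	 * buffer size, without duplicating the tracepoint in every
	 * inlined kmalloc() call site */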
@@ -4075,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
* necessary. Note that the l3 listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac, int force, int node)
{
int tofree;
@@ -4339,7 +4317,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
diff --git a/mm/slob.c b/mm/slob.c
index d582171c8101..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
} else {
unsigned int order = get_order(size);
- ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
+ if (likely(order))
+ gfp |= __GFP_COMP;
+ ret = slob_new_pages(gfp, order, node);
if (ret) {
struct page *page;
page = virt_to_page(ret);
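__GFP_COMP asks the page allocator to set up compound-page metadata, which only exists for order > 0 allocations; the fix also stops calling get_order() twice. The corrected path with commentary, sketched:

	unsigned int order = get_order(size);

	/* only multi-page allocations need (or can have) compound metadata */
	if (likely(order))
		gfp |= __GFP_COMP;
	ret = slob_new_pages(gfp, order, node);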
@@ -676,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int kmem_ptr_validate(struct kmem_cache *a, const void *b)
-{
- return 0;
-}
-
static unsigned int slob_ready __read_mostly;
int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1f0f3d..e15aa7f193c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <trace/events/kmem.h>
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -168,7 +170,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000UL /* Poison object */
-#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
static int kmem_size = sizeof(struct kmem_cache);
@@ -178,7 +179,7 @@ static struct notifier_block slab_notifier;
static enum {
DOWN, /* No slab functionality available */
- PARTIAL, /* kmem_cache_open() works but kmalloc does not */
+ PARTIAL, /* Kmem_cache_node works */
UP, /* Everything works but does not show up in sysfs */
SYSFS /* Sysfs up */
} slab_state = DOWN;
@@ -199,7 +200,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,6 +211,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
+ kfree(s->name);
kfree(s);
}
@@ -233,11 +235,7 @@ int slab_is_available(void)
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
-#ifdef CONFIG_NUMA
return s->node[node];
-#else
- return &s->local_node;
-#endif
}
/* Verify that a pointer has an address that is valid within a slab page */
@@ -494,7 +492,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
dump_stack();
}
-static void init_object(struct kmem_cache *s, void *object, int active)
+static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
@@ -504,9 +502,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->objsize,
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
- s->inuse - s->objsize);
+ memset(p + s->objsize, val, s->inuse - s->objsize);
}
static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -641,17 +637,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
}
static int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active)
+ void *object, u8 val)
{
u8 *p = object;
u8 *endobject = object + s->objsize;
if (s->flags & SLAB_RED_ZONE) {
- unsigned int red =
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
-
if (!check_bytes_and_report(s, page, object, "Redzone",
- endobject, red, s->inuse - s->objsize))
+ endobject, val, s->inuse - s->objsize))
return 0;
} else {
if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +654,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
}
if (s->flags & SLAB_POISON) {
- if (!active && (s->flags & __OBJECT_POISON) &&
+ if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->objsize - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +666,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
- if (!s->offset && active)
+ if (!s->offset && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -792,6 +785,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
}
/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(flags & __GFP_WAIT);
+
+ return should_failslab(s->objsize, flags, s->flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
+{
+ flags &= gfp_allowed_mask;
+ kmemcheck_slab_alloc(s, flags, object, s->objsize);
+ kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
+}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+ kmemleak_free_recursive(x, s->flags);
+}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
+{
+ kmemcheck_slab_free(s, object, s->objsize);
+ debug_check_no_locks_freed(object, s->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
+}
+
+/*
* Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache_node *n, struct page *page)
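These hooks gather the checks for the various memory debuggers (lockdep annotation, might_sleep, failslab fault injection, kmemcheck, kmemleak) that were previously spread through slab_alloc() and slab_free(); the stubs added in the !CONFIG_SLUB_DEBUG branch further down compile to nothing. Roughly how the allocation-side pair slots into the fast path (see the slab_alloc() hunk below):

static __always_inline void *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;

	if (slab_pre_alloc_hook(s, gfpflags))	/* may fail the allocation */
		return NULL;
	/* ... fast or slow path picks an object ... */
	slab_post_alloc_hook(s, gfpflags, object);	/* mark it allocated */
	return object;
}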
@@ -838,7 +864,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
* dilemma by deferring the increment of the count during
* bootstrap (see early_kmem_cache_node_alloc).
*/
- if (!NUMA_BUILD || n) {
+ if (n) {
atomic_long_inc(&n->nr_slabs);
atomic_long_add(objects, &n->total_objects);
}
@@ -858,11 +884,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
return;
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
}
-static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
+static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
@@ -878,14 +904,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
}
- if (!check_object(s, page, object, 0))
+ if (!check_object(s, page, object, SLUB_RED_INACTIVE))
goto bad;
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
trace(s, page, object, 1);
- init_object(s, object, 1);
+ init_object(s, object, SLUB_RED_ACTIVE);
return 1;
bad:
@@ -902,8 +928,8 @@ bad:
return 0;
}
-static int free_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, unsigned long addr)
+static noinline int free_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto fail;
@@ -918,7 +944,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
goto fail;
}
- if (!check_object(s, page, object, 1))
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
return 0;
if (unlikely(s != page->slab)) {
@@ -942,7 +968,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
return 1;
fail:
@@ -1046,7 +1072,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active) { return 1; }
+ void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
@@ -1066,7 +1092,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-#endif
+
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+ { return 0; }
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ void *object) {}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s,
+ void *object) {}
+
+#endif /* CONFIG_SLUB_DEBUG */
/*
* Slab allocation and freeing
@@ -1194,7 +1232,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
- check_object(s, page, p, 0);
+ check_object(s, page, p, SLUB_RED_INACTIVE);
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1274,13 +1312,19 @@ static void add_partial(struct kmem_cache_node *n,
spin_unlock(&n->list_lock);
}
+static inline void __remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ list_del(&page->lru);
+ n->nr_partial--;
+}
+
static void remove_partial(struct kmem_cache *s, struct page *page)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
spin_lock(&n->list_lock);
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
spin_unlock(&n->list_lock);
}
@@ -1293,8 +1337,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
struct page *page)
{
if (slab_trylock(page)) {
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
__SetPageSlubFrozen(page);
return 1;
}
@@ -1405,6 +1448,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
* On exit the slab lock will have been dropped.
*/
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
+ __releases(bitlock)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1447,6 +1491,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+ __releases(bitlock)
{
struct page *page = c->page;
int tail = 1;
@@ -1647,6 +1692,7 @@ new_slab:
goto load_freelist;
}
+ gfpflags &= gfp_allowed_mask;
if (gfpflags & __GFP_WAIT)
local_irq_enable();
@@ -1674,7 +1720,7 @@ debug:
c->page->inuse++;
c->page->freelist = get_freepointer(s, object);
- c->node = -1;
+ c->node = NUMA_NO_NODE;
goto unlock_out;
}
@@ -1695,12 +1741,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
- gfpflags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(gfpflags);
- might_sleep_if(gfpflags & __GFP_WAIT);
-
- if (should_failslab(s->objsize, gfpflags, s->flags))
+ if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
local_irq_save(flags);
@@ -1719,8 +1760,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
- kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
- kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
+ slab_post_alloc_hook(s, gfpflags, object);
return object;
}
@@ -1736,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
#endif
#ifdef CONFIG_NUMA
@@ -1754,16 +1804,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node)
+ int node, size_t size)
{
- return slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, s->size, gfpflags, node);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
#endif
/*
@@ -1850,14 +1904,14 @@ static __always_inline void slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
- kmemleak_free_recursive(x, s->flags);
+ slab_free_hook(s, x);
+
local_irq_save(flags);
c = __this_cpu_ptr(s->cpu_slab);
- kmemcheck_slab_free(s, object, s->objsize);
- debug_check_no_locks_freed(object, s->objsize);
- if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, s->objsize);
- if (likely(page == c->page && c->node >= 0)) {
+
+ slab_free_hook_irq(s, x);
+
+ if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
set_freepointer(s, object, c->freelist);
c->freelist = object;
stat(s, FREE_FASTPATH);
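The free-side checks are split by context: slab_free_hook() (kmemleak) may run before interrupts are disabled, while slab_free_hook_irq() (kmemcheck plus the lock/object debug checks) must run under local_irq_save(). The fast-path test also trades the magic c->node >= 0 for an explicit NUMA_NO_NODE sentinel. The resulting function, roughly:

static __always_inline void slab_free(struct kmem_cache *s,
		struct page *page, void *x, unsigned long addr)
{
	void **object = (void *)x;
	struct kmem_cache_cpu *c;
	unsigned long flags;

	slab_free_hook(s, x);			/* irqs may still be on */

	local_irq_save(flags);
	c = __this_cpu_ptr(s->cpu_slab);

	slab_free_hook_irq(s, x);		/* needs irqs off */

	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
		set_freepointer(s, object, c->freelist);
		c->freelist = object;
		stat(s, FREE_FASTPATH);
	} else
		__slab_free(s, page, x, addr);

	local_irq_restore(flags);
}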
@@ -1879,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab page the object resides */
-static struct page *get_object_page(const void *x)
-{
- struct page *page = virt_to_head_page(x);
-
- if (!PageSlab(page))
- return NULL;
-
- return page;
-}
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -2062,26 +2105,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
#endif
}
-static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
-
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
- if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
- /*
- * Boot time creation of the kmalloc array. Use static per cpu data
- * since the per cpu allocator is not available yet.
- */
- s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
- else
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
+ SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
- if (!s->cpu_slab)
- return 0;
+ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
- return 1;
+ return s->cpu_slab != NULL;
}
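With the per-cpu allocator now able to serve requests during early boot from the PERCPU_DYNAMIC_EARLY_SIZE reserve, the static kmalloc_percpu[] bootstrap array can go away and every cache, boot-time ones included, uses alloc_percpu(). The same function with explanatory comments (a sketch):

static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	/* the boot-time kmalloc caches draw from the early per-cpu
	 * reserve: make sure it can hold one kmem_cache_cpu per cache */
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));

	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);

	return s->cpu_slab != NULL;
}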
-#ifdef CONFIG_NUMA
+static struct kmem_cache *kmem_cache_node;
+
/*
* No kmalloc_node yet so do it by hand. We know that this is the first
* slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2126,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
* when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
-static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
+static void early_kmem_cache_node_alloc(int node)
{
struct page *page;
struct kmem_cache_node *n;
unsigned long flags;
- BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+ BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmalloc_caches, gfpflags, node);
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!page);
if (page_to_nid(page) != node) {
@@ -2111,15 +2146,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
n = page->freelist;
BUG_ON(!n);
- page->freelist = get_freepointer(kmalloc_caches, n);
+ page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse++;
- kmalloc_caches->node[node] = n;
+ kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
- init_object(kmalloc_caches, n, 1);
- init_tracking(kmalloc_caches, n);
+ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
+ init_tracking(kmem_cache_node, n);
#endif
- init_kmem_cache_node(n, kmalloc_caches);
- inc_slabs_node(kmalloc_caches, node, page->objects);
+ init_kmem_cache_node(n, kmem_cache_node);
+ inc_slabs_node(kmem_cache_node, node, page->objects);
/*
* lockdep requires consistent irq usage for each lock
@@ -2137,13 +2172,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = s->node[node];
+
if (n)
- kmem_cache_free(kmalloc_caches, n);
+ kmem_cache_free(kmem_cache_node, n);
+
s->node[node] = NULL;
}
}
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -2151,11 +2188,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
struct kmem_cache_node *n;
if (slab_state == DOWN) {
- early_kmem_cache_node_alloc(gfpflags, node);
+ early_kmem_cache_node_alloc(node);
continue;
}
- n = kmem_cache_alloc_node(kmalloc_caches,
- gfpflags, node);
+ n = kmem_cache_alloc_node(kmem_cache_node,
+ GFP_KERNEL, node);
if (!n) {
free_kmem_cache_nodes(s);
@@ -2167,17 +2204,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
}
return 1;
}
-#else
-static void free_kmem_cache_nodes(struct kmem_cache *s)
-{
-}
-
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
-{
- init_kmem_cache_node(&s->local_node, s);
- return 1;
-}
-#endif
static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
@@ -2312,7 +2338,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
}
-static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+static int kmem_cache_open(struct kmem_cache *s,
const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *))
@@ -2348,10 +2374,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
- if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+ if (!init_kmem_cache_nodes(s))
goto error;
- if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
+ if (alloc_kmem_cache_cpus(s))
return 1;
free_kmem_cache_nodes(s);
@@ -2365,35 +2391,6 @@ error:
}
/*
- * Check if a given pointer is valid
- */
-int kmem_ptr_validate(struct kmem_cache *s, const void *object)
-{
- struct page *page;
-
- if (!kern_ptr_validate(object, s->size))
- return 0;
-
- page = get_object_page(object);
-
- if (!page || s != page->slab)
- /* No slab or wrong slab */
- return 0;
-
- if (!check_valid_pointer(s, page, object))
- return 0;
-
- /*
- * We could also check if the object is on the slabs freelist.
- * But this would be too expensive and it seems that the main
- * purpose of kmem_ptr_valid() is to check if the object belongs
- * to a certain slab.
- */
- return 1;
-}
-EXPORT_SYMBOL(kmem_ptr_validate);
-
-/*
* Determine the size of a slab object
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -2414,9 +2411,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
- GFP_ATOMIC);
-
+ unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
+ sizeof(long), GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, "%s", text);
@@ -2448,9 +2444,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- list_del(&page->lru);
+ __remove_partial(n, page);
discard_slab(s, page);
- n->nr_partial--;
} else {
list_slab_objects(s, page,
"Objects remaining on kmem_cache_close()");
@@ -2507,9 +2502,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
* Kmalloc subsystem
*******************************************************************/
-struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
+struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
EXPORT_SYMBOL(kmalloc_caches);
+static struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+#endif
+
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -2546,116 +2547,29 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
-static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
- const char *name, int size, gfp_t gfp_flags)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ int size, unsigned int flags)
{
- unsigned int flags = 0;
+ struct kmem_cache *s;
- if (gfp_flags & SLUB_DMA)
- flags = SLAB_CACHE_DMA;
+ s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
/*
* This function is called with IRQs disabled during early-boot on
* single CPU so there's no need to take slub_lock here.
*/
- if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
+ if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;
list_add(&s->list, &slab_caches);
-
- if (sysfs_slab_add(s))
- goto panic;
return s;
panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
+ return NULL;
}
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
-
-static void sysfs_add_func(struct work_struct *w)
-{
- struct kmem_cache *s;
-
- down_write(&slub_lock);
- list_for_each_entry(s, &slab_caches, list) {
- if (s->flags & __SYSFS_ADD_DEFERRED) {
- s->flags &= ~__SYSFS_ADD_DEFERRED;
- sysfs_slab_add(s);
- }
- }
- up_write(&slub_lock);
-}
-
-static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
-
-static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
-{
- struct kmem_cache *s;
- char *text;
- size_t realsize;
- unsigned long slabflags;
- int i;
-
- s = kmalloc_caches_dma[index];
- if (s)
- return s;
-
- /* Dynamically create dma cache */
- if (flags & __GFP_WAIT)
- down_write(&slub_lock);
- else {
- if (!down_write_trylock(&slub_lock))
- goto out;
- }
-
- if (kmalloc_caches_dma[index])
- goto unlock_out;
-
- realsize = kmalloc_caches[index].objsize;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
- (unsigned int)realsize);
-
- s = NULL;
- for (i = 0; i < KMALLOC_CACHES; i++)
- if (!kmalloc_caches[i].size)
- break;
-
- BUG_ON(i >= KMALLOC_CACHES);
- s = kmalloc_caches + i;
-
- /*
- * Must defer sysfs creation to a workqueue because we don't know
- * what context we are called from. Before sysfs comes up, we don't
- * need to do anything because our sysfs initcall will start by
- * adding all existing slabs to sysfs.
- */
- slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
- if (slab_state >= SYSFS)
- slabflags |= __SYSFS_ADD_DEFERRED;
-
- if (!text || !kmem_cache_open(s, flags, text,
- realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
- s->size = 0;
- kfree(text);
- goto unlock_out;
- }
-
- list_add(&s->list, &slab_caches);
- kmalloc_caches_dma[index] = s;
-
- if (slab_state >= SYSFS)
- schedule_work(&sysfs_add_work);
-
-unlock_out:
- up_write(&slub_lock);
-out:
- return kmalloc_caches_dma[index];
-}
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2622,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
#ifdef CONFIG_ZONE_DMA
if (unlikely((flags & SLUB_DMA)))
- return dma_kmalloc_cache(index, flags);
+ return kmalloc_dma_caches[index];
#endif
- return &kmalloc_caches[index];
+ return kmalloc_caches[index];
}
void *__kmalloc(size_t size, gfp_t flags)
@@ -2735,6 +2649,7 @@ void *__kmalloc(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc);
+#ifdef CONFIG_NUMA
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
@@ -2749,7 +2664,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
return ptr;
}
-#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
@@ -2889,8 +2803,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
* may have freed the last object and be
* waiting to release the slab.
*/
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
slab_unlock(page);
discard_slab(s, page);
} else {
@@ -2914,7 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_MEMORY_HOTPLUG)
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -2956,7 +2869,7 @@ static void slab_mem_offline_callback(void *arg)
BUG_ON(slabs_node(s, offline_node));
s->node[offline_node] = NULL;
- kmem_cache_free(kmalloc_caches, n);
+ kmem_cache_free(kmem_cache_node, n);
}
}
up_read(&slub_lock);
@@ -2989,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
* since memory is not yet available from the node that
* is brought up.
*/
- n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
+ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
@@ -3035,46 +2948,92 @@ static int slab_memory_callback(struct notifier_block *self,
* Basic setup of slabs
*******************************************************************/
+/*
+ * Used for early kmem_cache structures that were allocated using
+ * the page allocator
+ */
+
+static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+{
+ int node;
+
+ list_add(&s->list, &slab_caches);
+ s->refcount = -1;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+ struct page *p;
+
+ if (n) {
+ list_for_each_entry(p, &n->partial, lru)
+ p->slab = s;
+
+#ifdef CONFIG_SLUB_DEBUG

+ list_for_each_entry(p, &n->full, lru)
+ p->slab = s;
+#endif
+ }
+ }
+}
+
void __init kmem_cache_init(void)
{
int i;
int caches = 0;
+ struct kmem_cache *temp_kmem_cache;
+ int order;
+ struct kmem_cache *temp_kmem_cache_node;
+ unsigned long kmalloc_size;
+
+ kmem_size = offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *);
+
+ /* Allocate two kmem_caches from the page allocator */
+ kmalloc_size = ALIGN(kmem_size, cache_line_size());
+ order = get_order(2 * kmalloc_size);
+ kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
-#ifdef CONFIG_NUMA
/*
* Must first have the slab cache available for the allocations of the
* struct kmem_cache_node's. There is special bootstrap code in
* kmem_cache_open for slab_state == DOWN.
*/
- create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_NOWAIT);
- kmalloc_caches[0].refcount = -1;
- caches++;
+ kmem_cache_node = (void *)kmem_cache + kmalloc_size;
+
+ kmem_cache_open(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
-#endif
/* Able to allocate the per node structures */
slab_state = PARTIAL;
- /* Caches that are not of the two-to-the-power-of size */
- if (KMALLOC_MIN_SIZE <= 32) {
- create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_NOWAIT);
- caches++;
- }
- if (KMALLOC_MIN_SIZE <= 64) {
- create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_NOWAIT);
- caches++;
- }
+ temp_kmem_cache = kmem_cache;
+ kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache, temp_kmem_cache, kmem_size);
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
- create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_NOWAIT);
- caches++;
- }
+ /*
+ * Allocate kmem_cache_node properly from the kmem_cache slab.
+ * kmem_cache_node is separately allocated so no need to
+ * update any list pointers.
+ */
+ temp_kmem_cache_node = kmem_cache_node;
+ kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
+
+ kmem_cache_bootstrap_fixup(kmem_cache_node);
+
+ caches++;
+ kmem_cache_bootstrap_fixup(kmem_cache);
+ caches++;
+ /* Free temporary boot structure */
+ free_pages((unsigned long)temp_kmem_cache, order);
+
+ /* Now we can use the kmem_cache to allocate kmalloc slabs */
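A condensed sketch of the bootstrap sequence above, which resolves the chicken-and-egg problem of allocating struct kmem_cache from a kmem_cache:

	/*
	 * 1. Grab raw pages for two temporary kmem_cache structures
	 *    (kmem_cache and kmem_cache_node); no slab allocator exists yet.
	 * 2. Open kmem_cache_node first: slab_state == DOWN routes its node
	 *    structures through early_kmem_cache_node_alloc().
	 * 3. Open kmem_cache itself, then allocate the permanent copies of
	 *    both structures from kmem_cache and memcpy() the temporaries
	 *    into them.
	 * 4. kmem_cache_bootstrap_fixup() re-points page->slab on any slabs
	 *    created meanwhile at the relocated structures.
	 * 5. Free the temporary pages; kmalloc caches can now be created
	 *    normally via kmem_cache_alloc(kmem_cache, ...).
	 */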
/*
* Patch up the size_index table if we have strange large alignment
@@ -3114,26 +3073,60 @@ void __init kmem_cache_init(void)
size_index[size_index_elem(i)] = 8;
}
+ /* Caches that are not of the two-to-the-power-of size */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
+ caches++;
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
+ caches++;
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
+ caches++;
+ }
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[1]->name);
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[2]->name);
+ }
+
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
BUG_ON(!s);
- kmalloc_caches[i].name = s;
+ kmalloc_caches[i]->name = s;
}
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
#endif
-#ifdef CONFIG_NUMA
- kmem_size = offsetof(struct kmem_cache, node) +
- nr_node_ids * sizeof(struct kmem_cache_node *);
-#else
- kmem_size = sizeof(struct kmem_cache);
-#endif
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+
+ if (s && s->size) {
+ char *name = kasprintf(GFP_NOWAIT,
+ "dma-kmalloc-%d", s->objsize);
+
+ BUG_ON(!name);
+ kmalloc_dma_caches[i] = create_kmalloc_cache(name,
+ s->objsize, SLAB_CACHE_DMA);
+ }
+ }
+#endif
printk(KERN_INFO
"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" CPUs=%d, Nodes=%d\n",
@@ -3211,6 +3204,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
+ char *n;
if (WARN_ON(!name))
return NULL;
@@ -3234,24 +3228,30 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
return s;
}
+ n = kstrdup(name, GFP_KERNEL);
+ if (!n)
+ goto err;
+
s = kmalloc(kmem_size, GFP_KERNEL);
if (s) {
- if (kmem_cache_open(s, GFP_KERNEL, name,
+ if (kmem_cache_open(s, n,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
if (sysfs_slab_add(s)) {
list_del(&s->list);
+ kfree(n);
kfree(s);
goto err;
}
up_write(&slub_lock);
return s;
}
+ kfree(n);
kfree(s);
}
+err:
up_write(&slub_lock);
-err:
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
@@ -3318,6 +3318,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
return ret;
}
+#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
int node, unsigned long caller)
{
@@ -3346,8 +3347,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
+#endif
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int count_inuse(struct page *page)
{
return page->inuse;
@@ -3357,7 +3359,9 @@ static int count_total(struct page *page)
{
return page->objects;
}
+#endif
+#ifdef CONFIG_SLUB_DEBUG
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
@@ -3373,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
for_each_free_object(p, s, page->freelist) {
set_bit(slab_index(p, s, addr), map);
- if (!check_object(s, page, p, 0))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
return 0;
}
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, 1))
+ if (!check_object(s, page, p, SLUB_RED_ACTIVE))
return 0;
return 1;
}
@@ -3448,65 +3452,6 @@ static long validate_slab_cache(struct kmem_cache *s)
kfree(map);
return count;
}
-
-#ifdef SLUB_RESILIENCY_TEST
-static void resiliency_test(void)
-{
- u8 *p;
-
- printk(KERN_ERR "SLUB resiliency testing\n");
- printk(KERN_ERR "-----------------------\n");
- printk(KERN_ERR "A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
- " 0x12->0x%p\n\n", p + 16);
-
- validate_slab_cache(kmalloc_caches + 4);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
- " 0x34 -> -0x%p\n", p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches + 5);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches + 6);
-
- printk(KERN_ERR "\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 7);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
- p);
- validate_slab_cache(kmalloc_caches + 8);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 9);
-}
-#else
-static void resiliency_test(void) {};
-#endif
-
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
@@ -3635,7 +3580,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc,
- long *map)
+ unsigned long *map)
{
void *addr = page_address(page);
void *p;
@@ -3691,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "%7ld ", l->count);
if (l->addr)
- len += sprint_symbol(buf + len, (unsigned long)l->addr);
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
else
len += sprintf(buf + len, "<not-available>");
@@ -3735,7 +3680,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf, "No data\n");
return len;
}
+#endif
+
+#ifdef SLUB_RESILIENCY_TEST
+static void resiliency_test(void)
+{
+ u8 *p;
+
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
+ printk(KERN_ERR "SLUB resiliency testing\n");
+ printk(KERN_ERR "-----------------------\n");
+ printk(KERN_ERR "A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12;
+ printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
+ " 0x12->0x%p\n\n", p + 16);
+
+ validate_slab_cache(kmalloc_caches[4]);
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34;
+ printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches[5]);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *);
+ *p = 0x56;
+ printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches[6]);
+
+ printk(KERN_ERR "\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78;
+ printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[7]);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a;
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
+ validate_slab_cache(kmalloc_caches[8]);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab;
+ printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[9]);
+}
+#else
+#ifdef CONFIG_SYSFS
+static void resiliency_test(void) {};
+#endif
+#endif
+
+#ifdef CONFIG_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3797,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
+ lock_memory_hotplug();
+#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +3815,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
nodes[node] += x;
}
- } else if (flags & SO_PARTIAL) {
+ } else
+#endif
+ if (flags & SO_PARTIAL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3825,10 +3838,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
+ unlock_memory_hotplug();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
+#ifdef CONFIG_SLUB_DEBUG
static int any_slab_objects(struct kmem_cache *s)
{
int node;
@@ -3844,6 +3859,7 @@ static int any_slab_objects(struct kmem_cache *s)
}
return 0;
}
+#endif
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3930,12 +3946,9 @@ SLAB_ATTR(min_partial);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
-
- return n + sprintf(buf + n, "\n");
- }
- return 0;
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
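The %pS extension resolves a code pointer to symbol+offset inside vsnprintf() itself, replacing the explicit sprint_symbol()-into-buffer dance. For example (hypothetical output):

	pr_info("ctor: %pS\n", s->ctor);
	/* -> "ctor: my_ctor+0x0/0x20" */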
@@ -3945,12 +3958,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(aliases);
-static ssize_t slabs_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL);
-}
-SLAB_ATTR_RO(slabs);
-
static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +3982,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
-static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
-}
-SLAB_ATTR_RO(total_objects);
-
-static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-static ssize_t sanity_checks_store(struct kmem_cache *s,
+static ssize_t reclaim_account_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_RECLAIM_ACCOUNT;
if (buf[0] == '1')
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_RECLAIM_ACCOUNT;
return length;
}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR(reclaim_account);
-static ssize_t trace_show(struct kmem_cache *s, char *buf)
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
+SLAB_ATTR_RO(hwcache_align);
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
+#ifdef CONFIG_ZONE_DMA
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1')
- s->flags |= SLAB_TRACE;
- return length;
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(cache_dma);
+#endif
-#ifdef CONFIG_FAILSLAB
-static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
}
+SLAB_ATTR_RO(destroy_by_rcu);
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
+#ifdef CONFIG_SLUB_DEBUG
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
+ return show_slab_objects(s, buf, SO_ALL);
}
-SLAB_ATTR(failslab);
-#endif
+SLAB_ATTR_RO(slabs);
-static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
+SLAB_ATTR_RO(total_objects);
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
+static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
}
-SLAB_ATTR(reclaim_account);
-static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+static ssize_t sanity_checks_store(struct kmem_cache *s,
+ const char *buf, size_t length)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+ s->flags &= ~SLAB_DEBUG_FREE;
+ if (buf[0] == '1')
+ s->flags |= SLAB_DEBUG_FREE;
+ return length;
}
-SLAB_ATTR_RO(hwcache_align);
+SLAB_ATTR(sanity_checks);
-#ifdef CONFIG_ZONE_DMA
-static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-SLAB_ATTR_RO(cache_dma);
-#endif
-static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+static ssize_t trace_store(struct kmem_cache *s, const char *buf,
+ size_t length)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
+ s->flags &= ~SLAB_TRACE;
+ if (buf[0] == '1')
+ s->flags |= SLAB_TRACE;
+ return length;
}
-SLAB_ATTR_RO(destroy_by_rcu);
+SLAB_ATTR(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
@@ -4139,6 +4136,40 @@ static ssize_t validate_store(struct kmem_cache *s,
}
SLAB_ATTR(validate);
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_ALLOC);
+}
+SLAB_ATTR_RO(alloc_calls);
+
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_FREE);
+}
+SLAB_ATTR_RO(free_calls);
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef CONFIG_FAILSLAB
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_FAILSLAB;
+ if (buf[0] == '1')
+ s->flags |= SLAB_FAILSLAB;
+ return length;
+}
+SLAB_ATTR(failslab);
+#endif
+
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
return 0;
@@ -4158,22 +4189,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
}
SLAB_ATTR(shrink);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
-
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4279,25 +4294,27 @@ static struct attribute *slab_attrs[] = {
&min_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
- &total_objects_attr.attr,
- &slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
&ctor_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
- &sanity_checks_attr.attr,
- &trace_attr.attr,
&hwcache_align_attr.attr,
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
+ &shrink_attr.attr,
+#ifdef CONFIG_SLUB_DEBUG
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
&red_zone_attr.attr,
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
@@ -4377,6 +4394,7 @@ static void kmem_cache_release(struct kobject *kobj)
{
struct kmem_cache *s = to_slab(kobj);
+ kfree(s->name);
kfree(s);
}
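This pairs with the kstrdup() added in kmem_cache_create() earlier: the cache now owns its name string, so callers may pass a temporary one, and the sysfs release path frees it. The ownership contract, sketched:

	/* create side (earlier hunk):
	 *	n = kstrdup(name, GFP_KERNEL);	cache owns the copy
	 *	kmem_cache_open(s, n, ...);
	 * release side (above):
	 *	kfree(s->name) before kfree(s)
	 */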
@@ -4579,7 +4597,7 @@ static int __init slab_sysfs_init(void)
}
__initcall(slab_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS */
/*
* The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa41..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
*
* However, virtual mappings need a page table and TLBs. Many Linux
* architectures already map their physical space using 1-1 mappings
- * via TLBs. For those arches the virtual memmory map is essentially
+ * via TLBs. For those arches the virtual memory map is essentially
* for free if we use the same page size as the 1-1 mappings. In that
* case the overhead consists of a few additional pages that are
* allocated to create a view of memory for vmemmap.
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
-#ifdef CONFIG_NO_BOOTMEM
- free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
- if (vmemmap_buf_start < vmemmap_buf) {
- char name[15];
-
- snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
- reserve_early_without_check(__pa(vmemmap_buf_start),
- __pa(vmemmap_buf), name);
- }
-#else
free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
-#endif
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- int magic;
+ unsigned long magic;
for (i = 0; i < nr_pages; i++, page++) {
- magic = atomic_read(&page->_mapcount);
+ magic = (unsigned long) page->lru.next;
BUG_ON(magic == NODE_INFO);
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..c02f93611a84 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+}
+
+static void __put_single_page(struct page *page)
+{
+ __page_cache_release(page);
free_hot_cold_page(page, 0);
}
-static void put_compound_page(struct page *page)
+static void __put_compound_page(struct page *page)
{
- page = compound_head(page);
- if (put_page_testzero(page)) {
- compound_page_dtor *dtor;
+ compound_page_dtor *dtor;
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ __page_cache_release(page);
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = page->first_page;
+ smp_rmb();
+ /*
+ * If PageTail is still set after smp_rmb() we can be sure
+ * that the page->first_page we read wasn't a dangling pointer.
+ * See __split_huge_page_refcount() smp_wmb().
+ */
+ if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+ /*
+ * Verify that our page_head wasn't converted
+ * to a regular page before we got a
+ * reference on it.
+ */
+ if (unlikely(!PageHead(page_head))) {
+ /* PageHead is cleared after PageTail */
+ smp_rmb();
+ VM_BUG_ON(PageTail(page));
+ goto out_put_head;
+ }
+ /*
+ * Only run compound_lock on a valid PageHead,
+ * after having it pinned with
+ * get_page_unless_zero() above.
+ */
+ smp_mb();
+ /* page_head wasn't a dangling pointer */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON(PageHead(page_head));
+ out_put_head:
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
+ out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero now that
+ * split_huge_page_refcount is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_dec(&page->_count);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ compound_unlock_irqrestore(page_head, flags);
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
+ }
+ } else if (put_page_testzero(page)) {
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
}
}
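
The tail-page handling above follows a general lockless pattern: take a speculative reference with get_page_unless_zero(), then revalidate the object under compound_lock and back out if it changed underneath. A minimal userspace sketch of the same idea using C11 atomics — the object and field names here are illustrative, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;		/* 0 means the object is being freed */
	atomic_bool valid;		/* state we must revalidate after pinning */
};

/* speculative pin: fails if the refcount already dropped to zero */
static bool get_obj_unless_zero(struct obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return true;	/* c is reloaded on CAS failure */
	return false;
}

/* drop a reference; returns true when we held the last one */
static bool put_obj(struct obj *o)
{
	return atomic_fetch_sub(&o->refcount, 1) == 1;
}

static bool pin_and_use(struct obj *o)
{
	if (!get_obj_unless_zero(o))
		return false;		/* raced with the final free */
	if (!atomic_load(&o->valid)) {	/* changed under us: back out */
		put_obj(o);
		return false;
	}
	/* ... object is safely pinned and still valid here ... */
	put_obj(o);
	return true;
}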
@@ -75,7 +155,7 @@ void put_page(struct page *page)
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
- __page_cache_release(page);
+ __put_single_page(page);
}
EXPORT_SYMBOL(put_page);
@@ -378,6 +458,7 @@ void release_pages(struct page **pages, int nr, int cold)
pagevec_free(&pages_to_free);
}
+EXPORT_SYMBOL(release_pages);
/*
* The pages which we're about to release may be in the deferred lru-addition
@@ -398,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_release);
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct zone* zone,
+ struct page *page, struct page *page_tail)
+{
+ int active;
+ enum lru_list lru;
+ const int file = 0;
+ struct list_head *head;
+
+ VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON(PageCompound(page_tail));
+ VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+
+ SetPageLRU(page_tail);
+
+ if (page_evictable(page_tail, NULL)) {
+ if (PageActive(page)) {
+ SetPageActive(page_tail);
+ active = 1;
+ lru = LRU_ACTIVE_ANON;
+ } else {
+ active = 0;
+ lru = LRU_INACTIVE_ANON;
+ }
+ update_page_reclaim_stat(zone, page_tail, file, active);
+ if (likely(PageLRU(page)))
+ head = page->lru.prev;
+ else
+ head = &zone->lru[lru].list;
+ __add_page_to_lru_list(zone, page_tail, lru, head);
+ } else {
+ SetPageUnevictable(page_tail);
+ add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+ }
+}
+
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page))) {
+ swapcache_free(entry, NULL);
+ return 0;
+ }
+
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36f..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
+#include <linux/poll.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
+/* Activity counter to indicate that a swapon or swapoff has occurred */
+static atomic_t proc_poll_event = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -139,7 +144,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
return err;
cond_resched();
@@ -150,7 +155,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
break;
@@ -189,7 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
+ nr_blocks, GFP_NOIO, 0))
break;
}
@@ -959,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (unlikely(pmd_trans_huge(*pmd)))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1672,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
@@ -1680,6 +1687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
filp_close(swap_file, NULL);
err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
out_dput:
filp_close(victim, NULL);
@@ -1688,6 +1697,25 @@ out:
}
#ifdef CONFIG_PROC_FS
+struct proc_swaps {
+ struct seq_file seq;
+ int event;
+};
+
+static unsigned swaps_poll(struct file *file, poll_table *wait)
+{
+ struct proc_swaps *s = file->private_data;
+
+ poll_wait(file, &proc_poll_wait, wait);
+
+ if (s->event != atomic_read(&proc_poll_event)) {
+ s->event = atomic_read(&proc_poll_event);
+ return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+ return POLLIN | POLLRDNORM;
+}
+
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
@@ -1771,7 +1799,24 @@ static const struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &swaps_op);
+ struct proc_swaps *s;
+ int ret;
+
+ s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ file->private_data = s;
+
+ ret = seq_open(file, &swaps_op);
+ if (ret) {
+ kfree(s);
+ return ret;
+ }
+
+ s->seq.private = s;
+ s->event = atomic_read(&proc_poll_event);
+ return ret;
}
static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1824,7 @@ static const struct file_operations proc_swaps_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
+ .poll = swaps_poll,
};
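
With the new .poll hook, userspace no longer has to re-read /proc/swaps on a timer: a monitor can block until a swapon or swapoff happens. Since swaps_poll() always reports POLLIN|POLLRDNORM and adds POLLERR|POLLPRI only on an event, the caller should request POLLPRI. A minimal consumer, with error handling trimmed:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/swaps", O_RDONLY);

	if (fd < 0)
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		/* dump the current table... */
		lseek(fd, 0, SEEK_SET);
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);

		/* ...then sleep until a swapon/swapoff bumps the event count */
		poll(&pfd, 1, -1);
		if (pfd.revents & (POLLERR | POLLPRI))
			puts("-- swap configuration changed --");
	}
}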
static int __init procswaps_init(void)
@@ -1894,8 +1940,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -EINVAL;
if (S_ISBLK(inode->i_mode)) {
- bdev = I_BDEV(inode);
- error = bd_claim(bdev, sys_swapon);
+ bdev = bdgrab(I_BDEV(inode));
+ error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ sys_swapon);
if (error < 0) {
bdev = NULL;
error = -EINVAL;
@@ -2084,12 +2131,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
swap_info[prev]->next = type;
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
error = 0;
goto out;
bad_swap:
if (bdev) {
set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
swap_cgroup_swapoff(type);
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..49feb46e77b8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
page_cache_release(page); /* pagecache ref */
return 1;
failed:
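
The ordering here matters: ->freepage is invoked only after the page has been removed from the radix tree and tree_lock has been dropped, so the callback is free to block or take its own locks. A userspace sketch of the same "snapshot the callback under the lock, call it unlocked" shape, with hypothetical names:

#include <pthread.h>
#include <stddef.h>

struct cache {
	pthread_mutex_t lock;
	void (*free_cb)(void *);	/* like a_ops->freepage; may be NULL */
	void *item;
};

static void remove_item(struct cache *c)
{
	void (*cb)(void *);
	void *item;

	pthread_mutex_lock(&c->lock);
	cb = c->free_cb;		/* sample the callback while locked */
	item = c->item;
	c->item = NULL;			/* unlink under the lock */
	pthread_mutex_unlock(&c->lock);

	if (cb && item)
		cb(item);		/* runs unlocked: may block */
}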
@@ -545,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache);
* @inode: inode
* @newsize: new file size
*
- * truncate_setsize updastes i_size update and performs pagecache
- * truncation (if necessary) for a file size updates. It will be
- * typically be called from the filesystem's setattr function when
- * ATTR_SIZE is passed in.
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
*
- * Must be called with inode_mutex held and after all filesystem
- * specific block truncation has been performed.
+ * Must be called with inode_mutex held and before all filesystem specific
+ * block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
}
EXPORT_SYMBOL(kzfree);
-int kern_ptr_validate(const void *ptr, unsigned long size)
-{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = sizeof(void *) - 1;
-
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- return 1;
-out:
- return 0;
-}
-
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
@@ -245,6 +224,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
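
The weak attribute gives every configuration a safe default: an architecture that implements a real __get_user_pages_fast() provides a strong definition that wins at link time, and everyone else gets the stub that pins nothing. The same mechanism in plain GCC C, with hypothetical names:

/* lib.c: weak default, analogous to the stub above */
int __attribute__((weak)) fast_pin(unsigned long start, int nr, void **out)
{
	(void)start; (void)nr; (void)out;
	return 0;	/* nothing pinned; caller takes the slow path */
}

/* arch.c: an optional strong definition silently overrides the weak one */
int fast_pin(unsigned long start, int nr, void **out)
{
	/* ... lockless architecture-specific implementation ... */
	return nr;
}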
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..f9b166732e70 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-bool vmap_lazy_unmap __read_mostly = true;
-
/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -293,13 +291,13 @@ static void __insert_vmap_area(struct vmap_area *va)
struct rb_node *tmp;
while (*p) {
- struct vmap_area *tmp;
+ struct vmap_area *tmp_va;
parent = *p;
- tmp = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp->va_end)
+ tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+ if (va->va_start < tmp_va->va_end)
p = &(*p)->rb_left;
- else if (va->va_end > tmp->va_start)
+ else if (va->va_end > tmp_va->va_start)
p = &(*p)->rb_right;
else
BUG();
@@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void)
{
unsigned int log;
- if (!vmap_lazy_unmap)
- return 0;
-
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -517,6 +512,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
static void purge_fragmented_blocks_allcpus(void);
/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
* Purges all lazily-freed vmap areas.
*
* If sync is 0 then don't purge if there is already a purge in progress.
@@ -557,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (va->va_end > *end)
*end = va->va_end;
nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- unmap_vmap_area(va);
list_add_tail(&va->purge_list, &valist);
va->flags |= VM_LAZY_FREEING;
va->flags &= ~VM_LAZY_FREE;
@@ -602,10 +605,11 @@ static void purge_vmap_area_lazy(void)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
*/
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+static void free_vmap_area_noflush(struct vmap_area *va)
{
va->flags |= VM_LAZY_FREE;
atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -614,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
}
/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
+}
+
+/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
@@ -734,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask);
- if (unlikely(IS_ERR(va))) {
+ if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
@@ -789,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb)
spin_unlock(&vmap_block_tree_lock);
BUG_ON(tmp != vb);
- free_unmap_vmap_area_noflush(vb->va);
+ free_vmap_area_noflush(vb->va);
call_rcu(&vb->rcu_head, rcu_free_vb);
}
@@ -927,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size)
rcu_read_unlock();
BUG_ON(!vb);
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
spin_lock(&vb->lock);
BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
@@ -979,7 +995,6 @@ void vm_unmap_aliases(void)
s = vb->va->va_start + (i << PAGE_SHIFT);
e = vb->va->va_start + (j << PAGE_SHIFT);
- vunmap_page_range(s, e);
flush = 1;
if (s < start)
@@ -1160,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
vunmap_page_range(addr, addr + size);
}
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1300,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
-1, GFP_KERNEL, caller);
}
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
- int node, gfp_t gfp_mask)
-{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, __builtin_return_address(0));
-}
-
static struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1522,25 +1531,12 @@ fail:
return NULL;
}
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
- void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
- __builtin_return_address(0));
-
- /*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
- */
- kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
-
- return addr;
-}
-
/**
- * __vmalloc_node - allocate virtually contiguous memory
+ * __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1550,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1562,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
- VMALLOC_END, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
+ gfp_mask, caller);
if (!area)
return NULL;
@@ -1580,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
return addr;
}
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, prot, node, caller);
+}
+
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -1587,6 +1604,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
+static inline void *__vmalloc_node_flags(unsigned long size,
+ int node, gfp_t flags)
+{
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -1598,12 +1622,28 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
- -1, __builtin_return_address(0));
+ return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc_node_flags(size, -1,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc);
+
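
vzalloc() is to vmalloc() what kzalloc() is to kmalloc(): callers that previously paired vmalloc() with memset() can drop the memset. A sketch of the intended conversion in a hypothetical driver (not a buildable module on its own):

/* before */
table = vmalloc(nr_entries * sizeof(*table));
if (!table)
	return -ENOMEM;
memset(table, 0, nr_entries * sizeof(*table));

/* after: the allocation arrives already zeroed */
table = vzalloc(nr_entries * sizeof(*table));
if (!table)
	return -ENOMEM;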
+/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
@@ -1644,6 +1684,25 @@ void *vmalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc_node() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node_flags(size, node,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -2056,6 +2115,7 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
+#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2145,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
- * @gfp_mask: allocation mask
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
- * congruent vmalloc areas for it. These areas tend to be scattered
- * pretty far, distance between two areas easily going up to
- * gigabytes. To avoid interacting with regular vmallocs, these areas
- * are allocated from top.
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans areas from the end looking for
@@ -2166,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align, gfp_t gfp_mask)
+ size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2176,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
unsigned long base, start, end, last_end;
bool purged = false;
- gfp_mask &= GFP_RECLAIM_MASK;
-
/* verify parameters and allocate data structures */
BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2210,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
- vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+ vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
+ vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
if (!vas || !vms)
goto err_free;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
- vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+ vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -2336,9 +2393,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
free_vm_area(vms[i]);
kfree(vms);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmlist_lock)
{
loff_t n = *pos;
struct vm_struct *v;
@@ -2365,6 +2424,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmlist_lock)
{
read_unlock(&vmlist_lock);
}
@@ -2395,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, "0x%p-0x%p %7ld",
v->addr, v->addr + v->size, v->size);
- if (v->caller) {
- char buff[KSYM_SYMBOL_LEN];
-
- seq_putc(m, ' ');
- sprint_symbol(buff, (unsigned long)v->caller);
- seq_puts(m, buff);
- }
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..17497d0cd8b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
@@ -51,6 +52,24 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC: Do not block
+ * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ * page from the LRU and reclaim all pages within a
+ * naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ * order-0 pages and then compact the zone
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
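
The __bitwise__ annotation on reclaim_mode_t is invisible to the compiler but lets sparse reject code that mixes these flags with plain integers; the __force casts mark the deliberate conversions in the #defines. A standalone approximation of that machinery, compilable with gcc and checkable with sparse:

#include <stdio.h>

#ifdef __CHECKER__			/* set when running under sparse */
#define __demo_bitwise	__attribute__((bitwise))
#define __demo_force	__attribute__((force))
#else
#define __demo_bitwise
#define __demo_force
#endif

typedef unsigned __demo_bitwise demo_mode_t;
#define DEMO_MODE_SINGLE ((__demo_force demo_mode_t)0x01u)
#define DEMO_MODE_SYNC	 ((__demo_force demo_mode_t)0x04u)

int main(void)
{
	demo_mode_t mode = DEMO_MODE_SINGLE | DEMO_MODE_SYNC;

	/* sparse would warn here: mode = 0x4; (plain int into bitwise type) */
	if (mode & DEMO_MODE_SYNC)
		puts("synchronous reclaim permitted");
	return 0;
}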
@@ -79,10 +98,10 @@ struct scan_control {
int order;
/*
- * Intend to reclaim enough contenious memory rather than to reclaim
- * enough amount memory. I.e, it's the mode for high order allocation.
+ * Intend to reclaim enough contiguous memory rather than just enough
+ * memory, i.e., the mode for high-order allocation.
*/
- bool lumpy_reclaim_mode;
+ reclaim_mode_t reclaim_mode;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -265,6 +284,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
+static void set_reclaim_mode(int priority, struct scan_control *sc,
+ bool sync)
+{
+ reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
+
+ /*
+ * Initially assume we are entering either lumpy reclaim or
+ * reclaim/compaction. Depending on the order, we will either set the
+ * sync mode or just reclaim order-0 pages later.
+ */
+ if (COMPACTION_BUILD)
+ sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
+
+ /*
+ * Avoid using lumpy reclaim or reclaim/compaction if possible by
+ * restricting when it's set to either costly allocations or when
+ * under memory pressure
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ sc->reclaim_mode |= syncmode;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ sc->reclaim_mode |= syncmode;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
+}
+
+static void reset_reclaim_mode(struct scan_control *sc)
+{
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -275,7 +327,8 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi)
+static int may_write_to_queue(struct backing_dev_info *bdi,
+ struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -283,6 +336,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
return 1;
if (bdi == current->backing_dev_info)
return 1;
+
+ /* lumpy reclaim for hugepages often needs a lot of writes */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
return 0;
}
@@ -307,12 +364,6 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
-/* Request for sync pageout. */
-enum pageout_io {
- PAGEOUT_IO_ASYNC,
- PAGEOUT_IO_SYNC,
-};
-
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -330,7 +381,7 @@ typedef enum {
* Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping,
- enum pageout_io sync_writeback)
+ struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -366,7 +417,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info))
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -376,7 +427,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1,
};
@@ -394,7 +444,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
- if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ if (PageWriteback(page) &&
+ (sc->reclaim_mode & RECLAIM_MODE_SYNC))
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
@@ -402,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page,
- trace_reclaim_flags(page, sync_writeback));
+ trace_reclaim_flags(page, sc->reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -459,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}
return 1;
@@ -580,7 +638,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->lumpy_reclaim_mode)
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
return PAGEREF_RECLAIM;
/*
@@ -616,7 +674,7 @@ static enum page_references page_check_references(struct page *page,
}
/* Reclaim if clean, defer dirty pages to writeback */
- if (referenced_page)
+ if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
@@ -644,12 +702,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc,
- enum pageout_io sync_writeback)
+ struct zone *zone,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
@@ -669,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -694,10 +755,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
+ may_enter_fs)
wait_on_page_writeback(page);
- else
- goto keep_locked;
+ else {
+ unlock_page(page);
+ goto keep_lumpy;
+ }
}
references = page_check_references(page, sc);
@@ -743,6 +807,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
+ nr_dirty++;
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -751,14 +817,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch (pageout(page, mapping, sync_writeback)) {
+ switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page))
+ goto keep_lumpy;
+ if (PageDirty(page))
goto keep;
+
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -841,6 +911,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
+ reset_reclaim_mode(sc);
continue;
activate_locked:
@@ -853,10 +924,21 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
+ reset_reclaim_mode(sc);
+keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+ /*
+ * Tag a zone as congested if all the dirty pages encountered were
+ * backed by a congested BDI. In this case, reclaimers should just
+ * back off and wait for congestion to clear because further reclaim
+ * will encounter the same problem
+ */
+ if (nr_dirty == nr_congested && nr_dirty != 0)
+ zone_set_flag(zone, ZONE_CONGESTED);
+
free_page_list(&free_pages);
list_splice(&ret_pages, page_list);
@@ -962,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
@@ -1006,7 +1088,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
- continue;
+ break;
/*
* If we don't have enough swap space, reclaiming of
@@ -1014,23 +1096,28 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* pointless.
*/
if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
- !PageSwapCache(cursor_page))
- continue;
+ !PageSwapCache(cursor_page))
+ break;
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
scan++;
} else {
- if (mode == ISOLATE_BOTH &&
- page_count(cursor_page))
- nr_lumpy_failed++;
+ /* the page is freed already. */
+ if (!page_count(cursor_page))
+ continue;
+ break;
}
}
+
+ /* If we break out of the loop above, lumpy reclaim failed */
+ if (pfn < end_pfn)
+ nr_lumpy_failed++;
}
*scanned = scan;
@@ -1070,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
@@ -1187,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1253,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
return false;
/* Only stall on lumpy reclaim */
- if (!sc->lumpy_reclaim_mode)
+ if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
 /* If we have reclaimed everything on the isolated list, no stall */
@@ -1286,7 +1375,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_active;
unsigned long nr_anon;
unsigned long nr_file;
@@ -1298,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return SWAP_CLUSTER_MAX;
}
-
+ set_reclaim_mode(priority, sc, false);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
@@ -1318,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
} else {
nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, sc->mem_cgroup,
0, file);
/*
@@ -1337,20 +1425,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 /* Check if we should synchronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
-
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, NULL);
- count_vm_events(PGDEACTIVATE, nr_active);
-
- nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ set_reclaim_mode(priority, sc, true);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
local_irq_disable();
@@ -1359,6 +1439,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+
+ trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
+ zone_idx(zone),
+ nr_scanned, nr_reclaimed,
+ priority,
+ trace_shrink_flags(file, sc->reclaim_mode));
return nr_reclaimed;
}
@@ -1398,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone,
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1466,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -1506,6 +1592,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
}
+#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
@@ -1531,12 +1618,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!total_swap_pages)
+ return 0;
+
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
+#else
+static inline int inactive_anon_is_low(struct zone *zone,
+ struct scan_control *sc)
+{
+ return 0;
+}
+#endif
static int inactive_file_is_low_global(struct zone *zone)
{
@@ -1721,19 +1822,55 @@ out:
}
}
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
+/*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely to succeed
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ struct scan_control *sc)
{
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
+
+ /* If not in reclaim/compaction mode, stop */
+ if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+ return false;
+
/*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
+ * If we failed to reclaim and have scanned the full list, stop.
+ * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+ * faster but obviously would be less likely to succeed at
+ * allocation. If this is desirable, use __GFP_REPEAT to decide
+ * if both reclaimed and scanned should be checked or just
+ * reclaimed
*/
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->lumpy_reclaim_mode = 1;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->lumpy_reclaim_mode = 1;
- else
- sc->lumpy_reclaim_mode = 0;
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = (2UL << sc->order);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (sc->nr_reclaimed < pages_for_compaction &&
+ inactive_lru_pages > pages_for_compaction)
+ return true;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ switch (compaction_suitable(zone, sc->order)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ return true;
+ }
}
/*
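
The continuation test reduces to simple arithmetic: keep reclaiming while fewer than 2 << order pages have been freed and the inactive LRUs could still supply that many. A standalone rendering of the check with a worked order-9 case (2MB hugepage, 4K base pages assumed):

#include <stdbool.h>

static bool keep_reclaiming(unsigned int order, unsigned long nr_reclaimed,
			    unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	/* not enough freed yet, but the inactive lists can still supply it */
	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

/*
 * order 9: pages_for_compaction = 2UL << 9 = 1024 pages (4MB), so reclaim
 * continues until 1024 pages are freed or the inactive LRUs run short,
 * after which compaction_suitable() makes the final call.
 */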
@@ -1745,13 +1882,14 @@ static void shrink_zone(int priority, struct zone *zone,
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
- unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+restart:
+ nr_reclaimed = 0;
+ nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
- set_lumpy_reclaim_mode(priority, sc);
-
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -1775,16 +1913,20 @@ static void shrink_zone(int priority, struct zone *zone,
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
-
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
+ if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ /* reclaim/compaction might need reclaim to continue */
+ if (should_continue_reclaim(zone, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
+ goto restart;
+
throttle_vm_writeout(sc->gfp_mask);
}
@@ -1937,21 +2079,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
+
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ &cpuset_current_mems_allowed,
+ &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+ }
}
out:
- /*
- * Now that we've scanned all the zones at this priority level, note
- * that level within the zone so that the next thread which performs
- * scanning of this zone will immediately start out at this priority
- * level. This affects only the decision whether or not to bring
- * mapped pages onto the inactive list.
- */
- if (priority < 0)
- priority = 0;
-
delayacct_freepages_end();
put_mems_allowed();
@@ -2063,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
}
#endif
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the caller's classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonably sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+ int classzone_idx)
+{
+ unsigned long present_pages = 0;
+ int i;
+
+ for (i = 0; i <= classzone_idx; i++)
+ present_pages += pgdat->node_zones[i].present_pages;
+
+ return balanced_pages > (present_pages >> 2);
+}
+
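
Concretely, the test passes when the balanced zones hold strictly more than a quarter of all pages up to classzone_idx. A standalone check with made-up zone sizes — a 16M DMA zone plus a 900M Normal zone, 4K pages:

#include <stdbool.h>
#include <stdio.h>

static bool demo_pgdat_balanced(unsigned long balanced_pages,
				const unsigned long *zone_pages,
				int classzone_idx)
{
	unsigned long present = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present += zone_pages[i];

	return balanced_pages > (present >> 2);	/* strictly above 25% */
}

int main(void)
{
	/* hypothetical node: 16M DMA (4096 pages) + 900M Normal (230400) */
	unsigned long zones[] = { 4096, 230400 };

	/* only DMA balanced: 4096 <= 58624, node is not balanced */
	printf("%d\n", demo_pgdat_balanced(4096, zones, 1));
	/* only Normal balanced: 230400 > 58624, node is balanced */
	printf("%d\n", demo_pgdat_balanced(230400, zones, 1));
	return 0;
}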
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
+ unsigned long balanced = 0;
+ bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
- 0, 0))
- return 1;
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ classzone_idx, 0))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
}
- return 0;
+ /*
+ * For high-order requests, the balanced zones must contain at least
+ * 25% of the node's pages for kswapd to sleep. For order-0, all zones
+ * must be balanced
+ */
+ if (order)
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
+ else
+ return !all_zones_ok;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -2111,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+ int *classzone_idx)
{
int all_zones_ok;
+ unsigned long balanced;
int priority;
int i;
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
@@ -2138,7 +2328,6 @@ loop_again:
count_vm_event(PAGEOUTRUN);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
@@ -2147,6 +2336,7 @@ loop_again:
disable_swap_token();
all_zones_ok = 1;
+ balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -2169,9 +2359,10 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
@@ -2194,6 +2385,7 @@ loop_again:
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
+ int compaction;
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
@@ -2215,7 +2407,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2223,9 +2415,26 @@ loop_again:
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+
+ compaction = 0;
+ if (order &&
+ zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone),
+ end_zone, 0) &&
+ !zone_watermark_ok(zone, order,
+ high_wmark_pages(zone),
+ end_zone, 0)) {
+ compact_zone_order(zone,
+ order,
+ sc.gfp_mask, false,
+ COMPACT_MODE_KSWAPD);
+ compaction = 1;
+ }
+
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (!compaction && nr_slab == 0 &&
+ !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2236,7 +2445,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2244,13 +2453,24 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * speculatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
- if (all_zones_ok)
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2273,7 +2493,13 @@ loop_again:
break;
}
out:
- if (!all_zones_ok) {
+
+ /*
+ * order-0: All zones must meet high watermark for a balanced node
+ * high-order: Balanced zones must make up at least 25% of the node
+ * for the node to be balanced
+ */
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
@@ -2298,7 +2524,88 @@ out:
goto loop_again;
}
- return sc.nr_reclaimed;
+ /*
+ * If kswapd was reclaiming at a higher order, it has the option of
+ * sleeping without all zones being balanced. Before it does, it must
+ * ensure that the watermarks for order-0 on *all* zones are met and
+ * that the congestion flags are cleared. The congestion flag must
+ * be cleared as kswapd is the only mechanism that clears the flag
+ * and it is potentially going to sleep here.
+ */
+ if (order) {
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
+
+ /* Confirm the zone is balanced for order-0 */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone), 0, 0)) {
+ order = sc.order = 0;
+ goto loop_again;
+ }
+
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ }
+ }
+
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /* Try to sleep for a short interval */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ schedule();
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
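
kswapd_try_to_sleep() is a two-phase doze: a short HZ/10 probe, then a full sleep only if nothing still looks premature. The same shape sketched in userspace with POSIX condition variables — illustrative only, not the kernel wait-queue API:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wakeup = PTHREAD_COND_INITIALIZER;
static bool work_pending;	/* stands in for sleeping_prematurely() */

static void try_to_sleep(void)
{
	struct timespec ts;

	pthread_mutex_lock(&lock);
	if (!work_pending) {
		/* phase 1: probe with a short ~100ms timed sleep */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_nsec += 100000000L;
		if (ts.tv_nsec >= 1000000000L) {
			ts.tv_sec++;
			ts.tv_nsec -= 1000000000L;
		}
		pthread_cond_timedwait(&wakeup, &lock, &ts);
	}
	/* phase 2: commit to a full sleep only if still nothing to do */
	while (!work_pending)
		pthread_cond_wait(&wakeup, &lock);
	pthread_mutex_unlock(&lock);
}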
@@ -2317,9 +2624,10 @@ out:
static int kswapd(void *p)
{
unsigned long order;
+ int classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
+
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
@@ -2347,49 +2655,30 @@ static int kswapd(void *p)
set_freezable();
order = 0;
+ classzone_idx = MAX_NR_ZONES - 1;
for ( ; ; ) {
unsigned long new_order;
+ int new_classzone_idx;
int ret;
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- if (order < new_order) {
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
- * allocation
+ * allocation or has tighter zone constraints
*/
order = new_order;
+ classzone_idx = new_classzone_idx;
} else {
- if (!freezing(current) && !kthread_should_stop()) {
- long remaining = 0;
-
- /* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- remaining = schedule_timeout(HZ/10);
- finish_wait(&pgdat->kswapd_wait, &wait);
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- }
-
- /*
- * After a short sleep, check if it was a
- * premature sleep. If not, then go fully
- * to sleep until explicitly woken up
- */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
- schedule();
- } else {
- if (remaining)
- count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
- else
- count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
- }
- }
-
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
+ classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
- finish_wait(&pgdat->kswapd_wait, &wait);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -2401,7 +2690,7 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balance_pgdat(pgdat, order);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;
@@ -2410,23 +2699,26 @@ static int kswapd(void *p)
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
- return;
- if (pgdat->kswapd_max_order < order)
- pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order) {
+ pgdat->kswapd_max_order = order;
+ pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+ }
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
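Note how pgdat->classzone_idx only ever tightens between consumptions: kswapd resets it to MAX_NR_ZONES - 1 when it reads it, and every wakeup min()s it down, so the most restrictive pending request wins. A toy illustration of that rule (the zone indices are made up):

	#include <stdio.h>

	#define MAX_NR_ZONES 4

	int main(void)
	{
		int classzone_idx = MAX_NR_ZONES - 1;	/* reset: no constraint */
		int wakeups[] = { 2, 1 };  /* e.g. ZONE_NORMAL, then ZONE_DMA32 */

		for (int i = 0; i < 2; i++)
			if (wakeups[i] < classzone_idx)
				classzone_idx = wakeups[i];

		printf("kswapd balances up to zone index %d\n", classzone_idx);
		return 0;
	}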
@@ -2987,6 +3279,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3326,4 @@ void scan_unevictable_unregister_node(struct node *node)
{
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
-
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -81,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark.
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
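A quick worked example of the pressure calculation; the watermark distance is an assumption for illustration:

	#include <stdio.h>

	static int pressure_threshold(int watermark_distance, int nr_cpus)
	{
		int threshold = watermark_distance / nr_cpus;

		if (threshold < 1)
			threshold = 1;
		return threshold > 125 ? 125 : threshold;
	}

	int main(void)
	{
		/* assume low_wmark - min_wmark = 1024 pages */
		printf("8 cpus:  %d\n", pressure_threshold(1024, 8));	/* 125 */
		printf("64 cpus: %d\n", pressure_threshold(1024, 64));	/* 16 */
		return 0;
	}

With 64 CPUs the per-cpu threshold drops to 16, so even if every CPU holds a full diff the total drift (1024 pages) cannot push the zone below its min watermark.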
+
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -140,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -159,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+}
+
/*
* For use when we know that interrupts are disabled.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
- x = delta + *p;
+ t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
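The pattern __mod_zone_page_state implements — a cheap per-cpu signed diff, folded into the shared counter only when it crosses a threshold — can be sketched in portable C11, with a per-thread variable standing in for the per-cpu slot and an arbitrary threshold:

	#include <stdatomic.h>

	static atomic_long global_counter;	/* stands in for the zone counter */
	static _Thread_local long diff;		/* stands in for vm_stat_diff */
	#define THRESHOLD 32

	static void mod_counter(long delta)
	{
		long x = diff + delta;

		if (x > THRESHOLD || x < -THRESHOLD) {
			/* rare slow path: fold into the shared counter */
			atomic_fetch_add(&global_counter, x);
			x = 0;
		}
		diff = x;	/* common path: no shared-cacheline traffic */
	}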
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -219,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)++;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
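The overstep logic is what keeps a monotonically incremented counter from hammering the global array: rather than folding at the threshold and restarting the diff at zero, it folds v + t/2 and restarts at -t/2, stretching the interval between global updates. A numeric check, assuming t = 8:

	#include <assert.h>

	int main(void)
	{
		int t = 8;			/* assumed stat_threshold */
		int overstep = t >> 1;		/* 4 */
		int v = t + 1;			/* first value past the threshold */

		assert(v + overstep == 13);	/* folded into the zone counter */
		/* the diff restarts at -4, so the next fold is 13 increments
		 * away instead of 9 */
		assert(-overstep + 13 == t + 1);
		return 0;
	}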
@@ -240,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)--;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < -t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
@@ -259,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * With cmpxchg_local support we can use this_cpu_cmpxchg and avoid the
+ * overhead that comes with local_irq_save/restore.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+ */
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong cpu if we get
+ * rescheduled while executing here. However, a subsequent
+ * update will apply the threshold again and bring the
+ * counter back under the threshold.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1);
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
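mod_state() is the classic optimistic-update loop: read the current diff, compute the new value, and publish it with a compare-and-exchange, retrying if another update raced in between. A portable C11 analogue (one shared slot here where the kernel uses a per-cpu one; overstep omitted for brevity):

	#include <stdatomic.h>

	static atomic_long global_counter;
	static _Atomic long diff;	/* the kernel uses a per-cpu slot */
	#define THRESHOLD 32

	static void mod_state_sketch(long delta)
	{
		long o, n, z;

		do {
			z = 0;
			o = atomic_load(&diff);
			n = o + delta;
			if (n > THRESHOLD || n < -THRESHOLD) {
				z = n;	/* fold everything into the global */
				n = 0;
			}
		} while (!atomic_compare_exchange_weak(&diff, &o, n));

		if (z)
			atomic_fetch_add(&global_counter, z);
	}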
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Disable interrupts to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -289,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
/*
* Update the zone counters for one cpu.
@@ -394,6 +517,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
#endif
#ifdef CONFIG_COMPACTION
+
struct contig_page_info {
unsigned long free_pages;
unsigned long free_blocks_total;
@@ -745,6 +869,9 @@ static const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"nr_shmem",
+ "nr_dirtied",
+ "nr_written",
+
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
@@ -753,6 +880,9 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_anon_transparent_hugepages",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
@@ -826,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@@ -904,36 +1034,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
.release = seq_release,
};
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- unsigned long *e;
-#endif
- int i;
+ int i, stat_items_size;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
+ stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
+ stat_items_size += sizeof(struct vm_event_state);
#endif
+
+ v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
#endif
- return v + *pos;
+ return (unsigned long *)m->private + *pos;
}
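The final return is the subtle part: by this point v has been advanced past the zone and writeback slots, so v + *pos would index into the wrong entries; the buffer base from m->private is what *pos is defined against. The resulting flat layout, printed with stand-in item counts:

	#include <stdio.h>

	int main(void)
	{
		/* stand-ins for NR_VM_ZONE_STAT_ITEMS and
		 * NR_VM_WRITEBACK_STAT_ITEMS */
		int nr_zone = 30, nr_wb = 2;

		printf("zone stats:       [0, %d)\n", nr_zone);
		printf("dirty thresholds: [%d, %d)\n", nr_zone, nr_zone + nr_wb);
		printf("vm events:        [%d, ...)\n", nr_zone + nr_wb);
		return 0;
	}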
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -1017,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED: