diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 48 | ||||
-rw-r--r-- | mm/Makefile | 10 | ||||
-rw-r--r-- | mm/backing-dev.c | 74 | ||||
-rw-r--r-- | mm/bootmem.c | 13 | ||||
-rw-r--r-- | mm/compaction.c | 186 | ||||
-rw-r--r-- | mm/dmapool.c | 18 | ||||
-rw-r--r-- | mm/filemap.c | 98 | ||||
-rw-r--r-- | mm/highmem.c | 66 | ||||
-rw-r--r-- | mm/huge_memory.c | 2351 | ||||
-rw-r--r-- | mm/hugetlb.c | 290 | ||||
-rw-r--r-- | mm/internal.h | 9 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 13 | ||||
-rw-r--r-- | mm/ksm.c | 88 | ||||
-rw-r--r-- | mm/maccess.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 10 | ||||
-rw-r--r-- | mm/memblock.c | 835 | ||||
-rw-r--r-- | mm/memcontrol.c | 829 | ||||
-rw-r--r-- | mm/memory-failure.c | 284 | ||||
-rw-r--r-- | mm/memory.c | 399 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 100 | ||||
-rw-r--r-- | mm/mempolicy.c | 43 | ||||
-rw-r--r-- | mm/migrate.c | 329 | ||||
-rw-r--r-- | mm/mincore.c | 7 | ||||
-rw-r--r-- | mm/mlock.c | 170 | ||||
-rw-r--r-- | mm/mmap.c | 35 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 20 | ||||
-rw-r--r-- | mm/mmzone.c | 21 | ||||
-rw-r--r-- | mm/mprotect.c | 22 | ||||
-rw-r--r-- | mm/mremap.c | 17 | ||||
-rw-r--r-- | mm/nommu.c | 84 | ||||
-rw-r--r-- | mm/oom_kill.c | 33 | ||||
-rw-r--r-- | mm/page-writeback.c | 42 | ||||
-rw-r--r-- | mm/page_alloc.c | 401 | ||||
-rw-r--r-- | mm/page_isolation.c | 3 | ||||
-rw-r--r-- | mm/pagewalk.c | 6 | ||||
-rw-r--r-- | mm/percpu-km.c | 8 | ||||
-rw-r--r-- | mm/percpu-vm.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 415 | ||||
-rw-r--r-- | mm/percpu_up.c | 30 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 121 | ||||
-rw-r--r-- | mm/rmap.c | 130 | ||||
-rw-r--r-- | mm/shmem.c | 26 | ||||
-rw-r--r-- | mm/slab.c | 84 | ||||
-rw-r--r-- | mm/slob.c | 9 | ||||
-rw-r--r-- | mm/slub.c | 876 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 13 | ||||
-rw-r--r-- | mm/sparse.c | 4 | ||||
-rw-r--r-- | mm/swap.c | 132 | ||||
-rw-r--r-- | mm/swap_state.c | 6 | ||||
-rw-r--r-- | mm/swapfile.c | 66 | ||||
-rw-r--r-- | mm/truncate.c | 15 | ||||
-rw-r--r-- | mm/util.c | 34 | ||||
-rw-r--r-- | mm/vmalloc.c | 185 | ||||
-rw-r--r-- | mm/vmscan.c | 577 | ||||
-rw-r--r-- | mm/vmstat.c | 246 |
56 files changed, 7469 insertions, 2472 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f0fb9124e410..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. @@ -301,3 +301,49 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. + +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on X86 && MMU + select COMPACTION + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP + bool + default y diff --git a/mm/Makefile b/mm/Makefile index 34b2546a9e37..2b1b575ae712 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,13 +5,13 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o mmu_context.o \ + page_isolation.o mm_init.o mmu_context.o percpu.o \ $(mmu-y) obj-y += init-mm.o @@ -36,12 +36,8 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_SMP -obj-y += percpu.o -else -obj-y += percpu_up.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d420499a61..027100d30227 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(inode, &wb->b_dirty, i_list) + list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) + list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) + list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; spin_unlock(&inode_lock); @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/bootmem.c b/mm/bootmem.c index 142c84a54993..13b0caa9793c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/kmemleak.h> #include <linux/range.h> +#include <linux/memblock.h> #include <asm/bug.h> #include <asm/io.h> @@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(physaddr, physaddr + size); + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); #else unsigned long start, end; @@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(addr, addr + size); + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); #else unsigned long start, end; @@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #ifndef CONFIG_NO_BOOTMEM +int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} + static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { diff --git a/mm/compaction.c b/mm/compaction.c index 4d709ee59013..8be430b812de 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include <linux/sysfs.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/compaction.h> + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts @@ -30,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -38,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, struct page *page; if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); - mem_cgroup_del_lru(page); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -355,12 +397,31 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -375,10 +436,69 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -394,7 +514,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, false, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -402,6 +523,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ if (!list_empty(&cc->migratepages)) { @@ -418,8 +541,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) +unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -427,6 +552,8 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -442,16 +569,17 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -461,7 +589,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); @@ -469,43 +597,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } @@ -532,6 +631,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df063706f53..03bf3bb4519a 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { @@ -322,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); @@ -353,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -384,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -399,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -416,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -430,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e4221..83a45d35468b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -102,9 +102,6 @@ * ->inode_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) - * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) * ->i_mmap_lock @@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page) void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); + freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); } EXPORT_SYMBOL(remove_from_page_cache); @@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); @@ -612,6 +614,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -631,7 +646,9 @@ repeat: pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); if (pagep) { page = radix_tree_deref_slot(pagep); - if (unlikely(!page || page == RADIX_TREE_RETRY)) + if (unlikely(!page)) + goto out; + if (radix_tree_deref_retry(page)) goto repeat; if (!page_cache_get_speculative(page)) @@ -647,6 +664,7 @@ repeat: goto repeat; } } +out: rcu_read_unlock(); return page; @@ -764,12 +782,11 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) { + if (ret) + start = pages[ret-1]->index; goto restart; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -817,16 +834,9 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -836,6 +846,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; @@ -874,11 +894,7 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; if (!page_cache_get_speculative(page)) @@ -1016,6 +1032,9 @@ find_page: goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) goto page_not_up_to_date_locked; @@ -1539,25 +1558,30 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return ret | VM_FAULT_RETRY; + } + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -2177,12 +2201,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; @@ -2203,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); diff --git a/mm/highmem.c b/mm/highmem.c index 7a0aa1be4993..693394daa2ed 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,6 +29,11 @@ #include <linux/kgdb.h> #include <asm/tlbflush.h> + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +DEFINE_PER_CPU(int, __kmap_atomic_idx); +#endif + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -42,6 +47,9 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -422,61 +430,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#ifdef CONFIG_KGDB_KDB - if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { - WARN_ON(1); - warn_count--; - } -#endif /* CONFIG_KGDB_KDB */ -} - -#endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000000..3e29781ee762 --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,2351 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/mmu_notifier.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/mm_inline.h> +#include <linux/kthread.h> +#include <linux/khugepaged.h> +#include <linux/freezer.h> +#include <linux/mman.h> +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include "internal.h" + +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<<TRANSPARENT_HUGEPAGE_FLAG)| +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| +#endif + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. + */ +static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; + +static int khugepaged(void *none); +static int mm_slots_hash_init(void); +static int khugepaged_slab_init(void); +static void khugepaged_slab_free(void); + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash __read_mostly; +static struct kmem_cache *mm_slot_cache __read_mostly; + +/** + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: hash collision list + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. + */ +struct khugepaged_scan { + struct list_head mm_head; + struct mm_slot *mm_slot; + unsigned long address; +} khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + + +static int set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + extern int min_free_kbytes; + + if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) && + !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return 0; + + for_each_populated_zone(zone) + nr_zones++; + + /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. + */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + +static int start_khugepaged(void) +{ + int err = 0; + if (khugepaged_enabled()) { + int wakeup; + if (unlikely(!mm_slot_cache || !mm_slots_hash)) { + err = -ENOMEM; + goto out; + } + mutex_lock(&khugepaged_mutex); + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (unlikely(IS_ERR(khugepaged_thread))) { + printk(KERN_ERR + "khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + } + wakeup = !list_empty(&khugepaged_scan.mm_head); + mutex_unlock(&khugepaged_mutex); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else + /* wakeup to exit */ + wake_up_interruptible(&khugepaged_wait); +out: + return err; +} + +#ifdef CONFIG_SYSFS + +static ssize_t double_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (test_bit(enabled, &transparent_hugepage_flags)) { + VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); + return sprintf(buf, "[always] madvise never\n"); + } else if (test_bit(req_madv, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); +} +static ssize_t double_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + set_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + set_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); +} +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + + if (ret > 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + + return ret; +} +static struct kobj_attribute enabled_attr = + __ATTR(enabled, 0644, enabled_show, enabled_store); + +static ssize_t single_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) +{ + if (test_bit(flag, &transparent_hugepage_flags)) + return sprintf(buf, "[yes] no\n"); + else + return sprintf(buf, "yes [no]\n"); +} +static ssize_t single_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + if (!memcmp("yes", buf, + min(sizeof("yes")-1, count))) { + set_bit(flag, &transparent_hugepage_flags); + } else if (!memcmp("no", buf, + min(sizeof("no")-1, count))) { + clear_bit(flag, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +/* + * Currently defrag only disables __GFP_NOWAIT for allocation. A blind + * __GFP_REPEAT is too aggressive, it's never worth swapping tons of + * memory just to allocate one more hugepage. + */ +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static struct kobj_attribute defrag_attr = + __ATTR(defrag, 0644, defrag_show, defrag_store); + +#ifdef CONFIG_DEBUG_VM +static ssize_t debug_cow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static ssize_t debug_cow_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static struct kobj_attribute debug_cow_attr = + __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); +#endif /* CONFIG_DEBUG_VM */ + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, +#ifdef CONFIG_DEBUG_VM + &debug_cow_attr.attr, +#endif + NULL, +}; + +static struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; +#endif + + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } +#endif + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + + start_khugepaged(); + + set_recommended_min_free_kbytes(); + +out: + return err; +} +module_init(hugepage_init) + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING + "transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +static void prepare_pmd_huge_pte(pgtable_t pgtable, + struct mm_struct *mm) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} + +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + struct page *page) +{ + int ret = 0; + pgtable_t pgtable; + + VM_BUG_ON(!PageCompound(page)); + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); + put_page(page); + return VM_FAULT_OOM; + } + + clear_huge_page(page, haddr, HPAGE_PMD_NR); + __SetPageUptodate(page); + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_none(*pmd))) { + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); + put_page(page); + pte_free(mm, pgtable); + } else { + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + /* + * The spinlocking to take the lru_lock inside + * page_add_new_anon_rmap() acts as a full memory + * barrier to be sure clear_huge_page writes become + * visible after the set_pmd_at() write. + */ + page_add_new_anon_rmap(page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + prepare_pmd_huge_pte(pgtable, mm); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + spin_unlock(&mm->page_table_lock); + } + + return ret; +} + +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER); +} +#endif + +int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + pte_t *pte; + + if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + if (unlikely(!page)) + goto out; + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } + + return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + } +out: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) + return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma) +{ + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable; + int ret; + + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + + spin_lock(&dst_mm->page_table_lock); + spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { + /* split huge page running from under us */ + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); + pte_free(dst_mm, pgtable); + + wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ + goto out; + } + src_page = pmd_page(pmd); + VM_BUG_ON(!PageHead(src_page)); + get_page(src_page); + page_dup_rmap(src_page); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + pmdp_set_wrprotect(src_mm, addr, src_pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + prepare_pmd_huge_pte(pgtable, dst_mm); + + ret = 0; +out_unlock: + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); +out: + return ret; +} + +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) + put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); + put_page(new_page); + } else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + int zonestat; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + mem_cgroup_split_huge_fixup(page, page_tail); + + lru_add_page_tail(zone, page, page_tail); + } + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ + break; + } + + return 0; +} + +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA + VM_BUG_ON(!*hpage); + new_page = *hpage; +#else + VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + if (is_vma_temporary_stack(vma)) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + + if (unlikely(!isolated)) { + pte_unmap(pte); + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + pte_unmap(pte); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + +#ifndef CONFIG_NUMA + *hpage = NULL; +#endif + khugepaged_pages_collapsed++; +out_up_write: + up_write(&mm->mmap_sem); + return; + +out: + mem_cgroup_uncharge_page(new_page); +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: + progress++; + continue; + } + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + +#ifndef CONFIG_NUMA + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } +#else + if (IS_ERR(*hpage)) + break; +#endif + + if (unlikely(kthread_should_stop() || freezing(current))) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) + khugepaged_alloc_sleep(); + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} +#endif + +static void khugepaged_loop(void) +{ + struct page *hpage; + +#ifdef CONFIG_NUMA + hpage = NULL; +#endif + while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif + + khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA + if (hpage) + put_page(hpage); +#endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + VM_BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + VM_BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + if (unlikely(kthread_should_stop())) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c03273807182..bb0b7c128015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) +static void copy_gigantic_page(struct page *dst, struct page *src) { int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); + struct hstate *h = page_hstate(src); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + copy_highpage(dst, src); i++; dst = mem_map_next(dst, dst_base, i); src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) + +void copy_huge_page(struct page *dst, struct page *src) { int i; - struct hstate *h = hstate_vma(vma); + struct hstate *h = page_hstate(src); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_gigantic_page(dst, src); return; } might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + copy_highpage(dst + i, src + i); } } @@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; @@ -496,19 +479,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } err: @@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -848,6 +823,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); - if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -908,31 +899,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -1052,14 +1043,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); @@ -1373,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1385,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) - return 0; + goto out; h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1413,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1455,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1462,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; @@ -1867,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1888,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1927,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->nr_overcommit_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ @@ -2153,6 +2168,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + static int is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -2380,10 +2408,14 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; + } - copy_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2487,7 +2519,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { @@ -2515,22 +2547,20 @@ retry: hugepage_add_new_anon_rmap(page, vma, address); } } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); + goto backout_unlocked; + } page_dup_rmap(page); } /* - * Since memory error handler replaces pte into hwpoison swap entry - * at the time of error handling, a process which reserved but not have - * the mapping to the error hugepage does not have hwpoison swap entry. - * So we need to block accesses from such a process by checking - * PG_hwpoison bit here. - */ - if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON; - goto backout_unlocked; - } - - /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside @@ -2587,8 +2617,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON; + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); @@ -2665,7 +2699,8 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - unlock_page(page); + if (page != pagecache_page) + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -2878,18 +2913,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_MEMORY_FAILURE + +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. */ -void __isolate_hwpoisoned_huge_page(struct page *hpage) +int dequeue_hwpoisoned_huge_page(struct page *hpage) { struct hstate *h = page_hstate(hpage); int nid = page_to_nid(hpage); + int ret = -EBUSY; spin_lock(&hugetlb_lock); - list_del(&hpage->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } spin_unlock(&hugetlb_lock); + return ret; } +#endif diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..69488205723d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -243,7 +247,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..84225f3b7190 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } @@ -34,6 +34,7 @@ #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hash.h> +#include <linux/freezer.h> #include <asm/tlbflush.h> #include "internal.h" @@ -411,6 +412,20 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head = compound_trans_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. + */ + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); @@ -808,6 +827,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); @@ -1277,7 +1338,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1291,8 +1358,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } @@ -1724,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self, /* * Keep it very simple for now: just lock out ksmd and * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take ksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * ksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. */ - mutex_lock(&ksm_thread_mutex); + mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); break; case MEM_OFFLINE: diff --git a/mm/maccess.c b/mm/maccess.c index 4e348dbaecd7..e2b6f5634e0d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -1,9 +1,9 @@ /* * Access kernel memory without faulting. */ -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/mm.h> +#include <linux/uaccess.h> /** * probe_kernel_read(): safely attempt to read from a location diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif return 1; default: diff --git a/mm/memblock.c b/mm/memblock.c index 43840b305ecb..4618fda975a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -11,237 +11,421 @@ */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/bitops.h> +#include <linux/poison.h> +#include <linux/pfn.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/memblock.h> -#define MEMBLOCK_ALLOC_ANYWHERE 0 +struct memblock memblock __initdata_memblock; -struct memblock memblock; +int memblock_debug __initdata_memblock; +int memblock_can_resize __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; -static int memblock_debug; +/* inline so we don't get a warning when pr_debug is compiled out */ +static inline const char *memblock_type_name(struct memblock_type *type) +{ + if (type == &memblock.memory) + return "memory"; + else if (type == &memblock.reserved) + return "reserved"; + else + return "unknown"; +} -static int __init early_memblock(char *p) +/* + * Address comparison utilities + */ + +static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) { - if (p && strstr(p, "debug")) - memblock_debug = 1; + return addr & ~(size - 1); +} + +static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + +static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + if (base2 == base1 + size1) + return 1; + else if (base1 == base2 + size2) + return -1; + return 0; } -early_param("memblock", early_memblock); -static void memblock_dump(struct memblock_region *region, char *name) +static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, + unsigned long r1, unsigned long r2) { - unsigned long long base, size; - int i; + phys_addr_t base1 = type->regions[r1].base; + phys_addr_t size1 = type->regions[r1].size; + phys_addr_t base2 = type->regions[r2].base; + phys_addr_t size2 = type->regions[r2].size; - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + return memblock_addrs_adjacent(base1, size1, base2, size2); +} - for (i = 0; i < region->cnt; i++) { - base = region->region[i].base; - size = region->region[i].size; +long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + unsigned long i; - pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", - name, i, base, base + size - 1, size); + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; } + + return (i < type->cnt) ? i : -1; } -void memblock_dump_all(void) +/* + * Find, allocate, deallocate or reserve unreserved regions. All allocations + * are top-down. + */ + +static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align) { - if (!memblock_debug) - return; + phys_addr_t base, res_base; + long j; - pr_info("MEMBLOCK configuration:\n"); - pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); - pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); + /* In case, huge size is requested */ + if (end < size) + return MEMBLOCK_ERROR; - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + base = memblock_align_down((end - size), align); + + /* Prevent allocations returning 0 as it's also used to + * indicate an allocation failure + */ + if (start == 0) + start = PAGE_SIZE; + + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) + return base; + res_base = memblock.reserved.regions[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + + return MEMBLOCK_ERROR; } -static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, - u64 size2) +static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, + phys_addr_t align, phys_addr_t start, phys_addr_t end) { - return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); + long i; + + BUG_ON(0 == size); + + /* Pump up max_addr */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* We do a top-down search, this tends to limit memory + * fragmentation by keeping early boot allocs near the + * top of memory + */ + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + phys_addr_t memblockbase = memblock.memory.regions[i].base; + phys_addr_t memblocksize = memblock.memory.regions[i].size; + phys_addr_t bottom, top, found; + + if (memblocksize < size) + continue; + if ((memblockbase + memblocksize) <= start) + break; + bottom = max(memblockbase, start); + top = min(memblockbase + memblocksize, end); + if (bottom >= top) + continue; + found = memblock_find_region(bottom, top, size, align); + if (found != MEMBLOCK_ERROR) + return found; + } + return MEMBLOCK_ERROR; } -static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) { - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; + return memblock_find_base(size, align, start, end); +} - return 0; +/* + * Free memblock.reserved.regions + */ +int __init_memblock memblock_free_reserved_regions(void) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + return memblock_free(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static long memblock_regions_adjacent(struct memblock_region *rgn, - unsigned long r1, unsigned long r2) +/* + * Reserve memblock.reserved.regions + */ +int __init_memblock memblock_reserve_reserved_regions(void) { - u64 base1 = rgn->region[r1].base; - u64 size1 = rgn->region[r1].size; - u64 base2 = rgn->region[r2].base; - u64 size2 = rgn->region[r2].size; + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; - return memblock_addrs_adjacent(base1, size1, base2, size2); + return memblock_reserve(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { unsigned long i; - for (i = r; i < rgn->cnt - 1; i++) { - rgn->region[i].base = rgn->region[i + 1].base; - rgn->region[i].size = rgn->region[i + 1].size; + for (i = r; i < type->cnt - 1; i++) { + type->regions[i].base = type->regions[i + 1].base; + type->regions[i].size = type->regions[i + 1].size; } - rgn->cnt--; + type->cnt--; } /* Assumption: base addr of region 1 < base addr of region 2 */ -static void memblock_coalesce_regions(struct memblock_region *rgn, +static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, unsigned long r1, unsigned long r2) { - rgn->region[r1].size += rgn->region[r2].size; - memblock_remove_region(rgn, r2); + type->regions[r1].size += type->regions[r2].size; + memblock_remove_region(type, r2); } -void __init memblock_init(void) +/* Defined below but needed now */ +static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); + +static int __init_memblock memblock_double_array(struct memblock_type *type) { - /* Create a dummy zero size MEMBLOCK which will get coalesced away later. - * This simplifies the memblock_add() code below... + struct memblock_region *new_array, *old_array; + phys_addr_t old_size, new_size, addr; + int use_slab = slab_is_available(); + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation */ - memblock.memory.region[0].base = 0; - memblock.memory.region[0].size = 0; - memblock.memory.cnt = 1; + if (!memblock_can_resize) + return -1; - /* Ditto. */ - memblock.reserved.region[0].base = 0; - memblock.reserved.region[0].size = 0; - memblock.reserved.cnt = 1; -} + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + + /* Try to find some space for it. + * + * WARNING: We assume that either slab_is_available() and we use it or + * we use MEMBLOCK for allocations. That means that this is unsafe to use + * when bootmem is currently active (unless bootmem itself is implemented + * on top of MEMBLOCK which isn't the case yet) + * + * This should however not be an issue for now, as we currently only + * call into MEMBLOCK while it's still active, or much later when slab is + * active for memory hotplug operations + */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + } else + addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr == MEMBLOCK_ERROR) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + memblock_type_name(type), type->max, type->max * 2); + return -1; + } + new_array = __va(addr); -void __init memblock_analyze(void) -{ - int i; + memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); - memblock.memory.size = 0; + /* Found space, we now need to move the array over before + * we add the reserved region since it may be our reserved + * array itself that is full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* If we use SLAB that's it, we are done */ + if (use_slab) + return 0; - for (i = 0; i < memblock.memory.cnt; i++) - memblock.memory.size += memblock.memory.region[i].size; + /* Add the new reserved region now. Should not fail ! */ + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + + /* If the array wasn't our static init one, then free it. We only do + * that before SLAB is available as later on, we don't know whether + * to use kfree or free_bootmem_pages(). Shouldn't be a big deal + * anyways + */ + if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_size); + + return 0; } -static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) +extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2) +{ + return 1; +} + +static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long coalesced = 0; long adjacent, i; - if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; return 0; } /* First try and coalesce this MEMBLOCK with another. */ - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; if ((rgnbase == base) && (rgnsize == size)) /* Already have this region, so we're done */ return 0; adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); + /* Check if arch allows coalescing */ + if (adjacent != 0 && type == &memblock.memory && + !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) + break; if (adjacent > 0) { - rgn->region[i].base -= size; - rgn->region[i].size += size; + type->regions[i].base -= size; + type->regions[i].size += size; coalesced++; break; } else if (adjacent < 0) { - rgn->region[i].size += size; + type->regions[i].size += size; coalesced++; break; } } - if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { - memblock_coalesce_regions(rgn, i, i+1); + /* If we plugged a hole, we may want to also coalesce with the + * next region + */ + if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && + ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, + type->regions[i].size, + type->regions[i+1].base, + type->regions[i+1].size)))) { + memblock_coalesce_regions(type, i, i+1); coalesced++; } if (coalesced) return coalesced; - if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) + + /* If we are out of space, we fail. It's too late to resize the array + * but then this shouldn't have happened in the first place. + */ + if (WARN_ON(type->cnt >= type->max)) return -1; /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ - for (i = rgn->cnt - 1; i >= 0; i--) { - if (base < rgn->region[i].base) { - rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].size = rgn->region[i].size; + for (i = type->cnt - 1; i >= 0; i--) { + if (base < type->regions[i].base) { + type->regions[i+1].base = type->regions[i].base; + type->regions[i+1].size = type->regions[i].size; } else { - rgn->region[i+1].base = base; - rgn->region[i+1].size = size; + type->regions[i+1].base = base; + type->regions[i+1].size = size; break; } } - if (base < rgn->region[0].base) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if (base < type->regions[0].base) { + type->regions[0].base = base; + type->regions[0].size = size; + } + type->cnt++; + + /* The array is full ? Try to resize it. If that fails, we undo + * our allocation and return an error + */ + if (type->cnt == type->max && memblock_double_array(type)) { + type->cnt--; + return -1; } - rgn->cnt++; return 0; } -long memblock_add(u64 base, u64 size) +long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.memory; - - /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */ - if (base == 0) - memblock.rmo_size = size; - - return memblock_add_region(_rgn, base, size); + return memblock_add_region(&memblock.memory, base, size); } -static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) +static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { - u64 rgnbegin, rgnend; - u64 end = base + size; + phys_addr_t rgnbegin, rgnend; + phys_addr_t end = base + size; int i; rgnbegin = rgnend = 0; /* supress gcc warnings */ /* Find the region where (base, size) belongs to */ - for (i=0; i < rgn->cnt; i++) { - rgnbegin = rgn->region[i].base; - rgnend = rgnbegin + rgn->region[i].size; + for (i=0; i < type->cnt; i++) { + rgnbegin = type->regions[i].base; + rgnend = rgnbegin + type->regions[i].size; if ((rgnbegin <= base) && (end <= rgnend)) break; } /* Didn't find the region */ - if (i == rgn->cnt) + if (i == type->cnt) return -1; /* Check to see if we are removing entire region */ if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(rgn, i); + memblock_remove_region(type, i); return 0; } /* Check to see if region is matching at the front */ if (rgnbegin == base) { - rgn->region[i].base = end; - rgn->region[i].size -= size; + type->regions[i].base = end; + type->regions[i].size -= size; return 0; } /* Check to see if the region is matching at the end */ if (rgnend == end) { - rgn->region[i].size -= size; + type->regions[i].size -= size; return 0; } @@ -249,208 +433,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) * We need to split the entry - adjust the current one to the * beginging of the hole and add the region after hole. */ - rgn->region[i].size = base - rgn->region[i].base; - return memblock_add_region(rgn, end, rgnend - end); + type->regions[i].size = base - type->regions[i].base; + return memblock_add_region(type, end, rgnend - end); } -long memblock_remove(u64 base, u64 size) +long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -long __init memblock_free(u64 base, u64 size) +long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.reserved, base, size); } -long __init memblock_reserve(u64 base, u64 size) +long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.reserved; + struct memblock_type *_rgn = &memblock.reserved; BUG_ON(0 == size); return memblock_add_region(_rgn, base, size); } -long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - unsigned long i; + phys_addr_t found; - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) - break; - } + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); + + found = memblock_find_base(size, align, 0, max_addr); + if (found != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, found, size) >= 0) + return found; - return (i < rgn->cnt) ? i : -1; + return 0; } -static u64 memblock_align_down(u64 addr, u64 size) +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return addr & ~(size - 1); + phys_addr_t alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; } -static u64 memblock_align_up(u64 addr, u64 size) +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) { - return (addr + (size - 1)) & ~(size - 1); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, - u64 size, u64 align) + +/* + * Additional node-local allocators. Search for node memory is bottom up + * and walks memblock regions within that node bottom-up as well, but allocation + * within an memblock region is top-down. XXX I plan to fix that at some stage + * + * WARNING: Only available after early_node_map[] has been populated, + * on some architectures, that is after all the calls to add_active_range() + * have been done to populate it. + */ + +phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) { - u64 base, res_base; - long j; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * This code originates from sparc which really wants use to walk by addresses + * and returns the nid. This is not very convenient for early_pfn_map[] users + * as the map isn't sorted yet, and it really wants to be walked by nid. + * + * For now, I implement the inefficient method below which walks the early + * map multiple times. Eventually we may want to use an ARCH config option + * to implement a completely different method for both case. + */ + unsigned long start_pfn, end_pfn; + int i; - base = memblock_align_down((end - size), align); - while (start <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - base = ~(u64)0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); + for (i = 0; i < MAX_NUMNODES; i++) { + get_pfn_range_for_nid(i, &start_pfn, &end_pfn); + if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + continue; + *nid = i; + return min(end, PFN_PHYS(end_pfn)); } +#endif + *nid = 0; - return ~(u64)0; + return end; } -static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, - u64 (*nid_range)(u64, u64, int *), - u64 size, u64 align, int nid) +static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, + phys_addr_t size, + phys_addr_t align, int nid) { - u64 start, end; + phys_addr_t start, end; start = mp->base; end = start + mp->size; start = memblock_align_up(start, align); while (start < end) { - u64 this_end; + phys_addr_t this_end; int this_nid; - this_end = nid_range(start, end, &this_nid); + this_end = memblock_nid_range(start, end, &this_nid); if (this_nid == nid) { - u64 ret = memblock_alloc_nid_unreserved(start, this_end, - size, align); - if (ret != ~(u64)0) + phys_addr_t ret = memblock_find_region(start, this_end, size, align); + if (ret != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, ret, size) >= 0) return ret; } start = this_end; } - return ~(u64)0; + return MEMBLOCK_ERROR; } -u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, - u64 (*nid_range)(u64 start, u64 end, int *nid)) +phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - struct memblock_region *mem = &memblock.memory; + struct memblock_type *mem = &memblock.memory; int i; BUG_ON(0 == size); + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ size = memblock_align_up(size, align); + /* We do a bottom-up search for a region with the right + * nid since that's easier considering how memblock_nid_range() + * works + */ for (i = 0; i < mem->cnt; i++) { - u64 ret = memblock_alloc_nid_region(&mem->region[i], - nid_range, + phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], size, align, nid); - if (ret != ~(u64)0) + if (ret != MEMBLOCK_ERROR) return ret; } - return memblock_alloc(size, align); -} - -u64 __init memblock_alloc(u64 size, u64 align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + return 0; } -u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) +phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - u64 alloc; - - alloc = __memblock_alloc_base(size, align, max_addr); + phys_addr_t res = memblock_alloc_nid(size, align, nid); - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); - - return alloc; + if (res) + return res; + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); } -u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr) -{ - long i, j; - u64 base = 0; - u64 res_base; - - BUG_ON(0 == size); - - size = memblock_align_up(size, align); - - /* On some platforms, make sure we allocate lowmem */ - /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */ - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - max_addr = MEMBLOCK_REAL_LIMIT; - - for (i = memblock.memory.cnt - 1; i >= 0; i--) { - u64 memblockbase = memblock.memory.region[i].base; - u64 memblocksize = memblock.memory.region[i].size; - - if (memblocksize < size) - continue; - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - base = memblock_align_down(memblockbase + memblocksize - size, align); - else if (memblockbase < max_addr) { - base = min(memblockbase + memblocksize, max_addr); - base = memblock_align_down(base - size, align); - } else - continue; - while (base && memblockbase <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - return 0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); - } - } - return 0; -} +/* + * Remaining API functions + */ /* You must call memblock_analyze() before this. */ -u64 __init memblock_phys_mem_size(void) +phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory.size; + return memblock.memory_size; } -u64 memblock_end_of_DRAM(void) +phys_addr_t __init_memblock memblock_end_of_DRAM(void) { int idx = memblock.memory.cnt - 1; - return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } /* You must call memblock_analyze() after this. */ -void __init memblock_enforce_memory_limit(u64 memory_limit) +void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) { unsigned long i; - u64 limit; - struct memblock_property *p; + phys_addr_t limit; + struct memblock_region *p; if (!memory_limit) return; @@ -458,24 +623,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) /* Truncate the memblock regions to satisfy the memory limit. */ limit = memory_limit; for (i = 0; i < memblock.memory.cnt; i++) { - if (limit > memblock.memory.region[i].size) { - limit -= memblock.memory.region[i].size; + if (limit > memblock.memory.regions[i].size) { + limit -= memblock.memory.regions[i].size; continue; } - memblock.memory.region[i].size = limit; + memblock.memory.regions[i].size = limit; memblock.memory.cnt = i + 1; break; } - if (memblock.memory.region[0].size < memblock.rmo_size) - memblock.rmo_size = memblock.memory.region[0].size; - memory_limit = memblock_end_of_DRAM(); /* And truncate any reserves above the limit also. */ for (i = 0; i < memblock.reserved.cnt; i++) { - p = &memblock.reserved.region[i]; + p = &memblock.reserved.regions[i]; if (p->base > memory_limit) p->size = 0; @@ -489,53 +651,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) } } -int __init memblock_is_reserved(u64 addr) +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +int __init memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +int __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.memory, base); + + if (idx == -1) + return 0; + return memblock.memory.regions[idx].base <= base && + (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= (base + size); +} + +int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + + +void __init_memblock memblock_set_current_limit(phys_addr_t limit) +{ + memblock.current_limit = limit; +} + +static void __init_memblock memblock_dump(struct memblock_type *region, char *name) { + unsigned long long base, size; int i; - for (i = 0; i < memblock.reserved.cnt; i++) { - u64 upper = memblock.reserved.region[i].base + - memblock.reserved.region[i].size - 1; - if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) - return 1; + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + + for (i = 0; i < region->cnt; i++) { + base = region->regions[i].base; + size = region->regions[i].size; + + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", + name, i, base, base + size - 1, size); } - return 0; } -int memblock_is_region_reserved(u64 base, u64 size) +void __init_memblock memblock_dump_all(void) { - return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; + if (!memblock_debug) + return; + + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + + memblock_dump(&memblock.memory, "memory"); + memblock_dump(&memblock.reserved, "reserved"); } -/* - * Given a <base, len>, find which memory regions belong to this range. - * Adjust the request and return a contiguous chunk. - */ -int memblock_find(struct memblock_property *res) +void __init memblock_analyze(void) { int i; - u64 rstart, rend; - rstart = res->base; - rend = rstart + res->size - 1; + /* Check marker in the unused last array entry */ + WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + + memblock.memory_size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory_size += memblock.memory.regions[i].size; + + /* We allow resizing from there */ + memblock_can_resize = 1; +} + +void __init memblock_init(void) +{ + static int init_done __initdata = 0; + + if (init_done) + return; + init_done = 1; + + /* Hookup the initial arrays */ + memblock.memory.regions = memblock_memory_init_regions; + memblock.memory.max = INIT_MEMBLOCK_REGIONS; + memblock.reserved.regions = memblock_reserved_init_regions; + memblock.reserved.max = INIT_MEMBLOCK_REGIONS; + + /* Write a marker in the unused last array entry */ + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.regions[0].base = 0; + memblock.memory.regions[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. */ + memblock.reserved.regions[0].base = 0; + memblock.reserved.regions[0].size = 0; + memblock.reserved.cnt = 1; + + memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; +} + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) + +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + seq_printf(m, "%4d: ", i); + if (sizeof(phys_addr_t) == 4) + seq_printf(m, "0x%08lx..0x%08lx\n", + (unsigned long)reg->base, + (unsigned long)(reg->base + reg->size - 1)); + else + seq_printf(m, "0x%016llx..0x%016llx\n", + (unsigned long long)reg->base, + (unsigned long long)(reg->base + reg->size - 1)); - for (i = 0; i < memblock.memory.cnt; i++) { - u64 start = memblock.memory.region[i].base; - u64 end = start + memblock.memory.region[i].size - 1; - - if (start > rend) - return -1; - - if ((end >= rstart) && (start < rend)) { - /* adjust the request */ - if (rstart < start) - rstart = start; - if (rend > end) - rend = end; - res->base = rstart; - res->size = rend - rstart + 1; - return 0; - } } - return -1; + return 0; +} + +static int memblock_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, memblock_debug_show, inode->i_private); } + +static const struct file_operations memblock_debug_fops = { + .open = memblock_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + if (!root) + return -ENXIO; + debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9be3cf8a5da4..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif @@ -89,7 +96,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ - MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ + /* incremented at every pagein/pageout */ + MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, + MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; @@ -254,6 +264,12 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu *stat; + /* + * used when a cpu is offlined or other synchronizations + * See mem_cgroup_read_stat(). + */ + struct mem_cgroup_stat_cpu nocpu_base; + spinlock_t pcp_counter_lock; }; /* Stuffs for move charges at task migration. */ @@ -269,7 +285,7 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; @@ -530,14 +546,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) return mz; } +/* + * Implementation Note: reading percpu statistics for memcg. + * + * Both of vmstat[] and percpu_counter has threshold and do periodic + * synchronization to implement "quick" read. There are trade-off between + * reading cost and precision of value. Then, we may have a chance to implement + * a periodic synchronizion of counter in memcg's counter. + * + * But this _read() function is used for user interface now. The user accounts + * memory usage by memory cgroup and he _always_ requires exact value because + * he accounts memory. Even if we provide quick-and-fuzzy read, we always + * have to visit all online cpus and make sum. So, for now, unnecessary + * synchronization is not implemented. (just implemented for cpu hotplug) + * + * If there are kernel internal actions which can make use of some not-exact + * value, and reading all cpu value can be performance bottleneck in some + * common workload, threashold and synchonization as vmstat[] should be + * implemented. + */ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, enum mem_cgroup_stat_index idx) { int cpu; s64 val = 0; - for_each_possible_cpu(cpu) + get_online_cpus(); + for_each_online_cpu(cpu) val += per_cpu(mem->stat->count[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + put_online_cpus(); return val; } @@ -558,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) + bool file, int nr_pages) { - int val = (charge) ? 1 : -1; - preempt_disable(); - if (PageCgroupCache(pc)) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + if (file) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - if (charge) + /* pagein of a big page is an event. So, ignore page size */ + if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); - __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); preempt_enable(); } @@ -659,40 +702,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -/* - * Call callback function against all cgroup under hierarchy tree. - */ -static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, - int (*func)(struct mem_cgroup *, void *)) +/* The caller has to guarantee "mem" exists before calling this */ +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) { - int found, ret, nextid; struct cgroup_subsys_state *css; - struct mem_cgroup *mem; + int found; - if (!root->use_hierarchy) - return (*func)(root, data); - - nextid = 1; - do { - ret = 0; + if (!mem) /* ROOT cgroup has the smallest ID */ + return root_mem_cgroup; /*css_put/get against root is ignored*/ + if (!mem->use_hierarchy) { + if (css_tryget(&mem->css)) + return mem; + return NULL; + } + rcu_read_lock(); + /* + * searching a memory cgroup which has the smallest ID under given + * ROOT cgroup. (ID >= 1) + */ + css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + else mem = NULL; + rcu_read_unlock(); + return mem; +} +static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, + struct mem_cgroup *root, + bool cond) +{ + int nextid = css_id(&iter->css) + 1; + int found; + int hierarchy_used; + struct cgroup_subsys_state *css; + + hierarchy_used = iter->use_hierarchy; + + css_put(&iter->css); + /* If no ROOT, walk all, ignore hierarchy */ + if (!cond || (root && !hierarchy_used)) + return NULL; + + if (!root) + root = root_mem_cgroup; + + do { + iter = NULL; rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, - &found); + + css = css_get_next(&mem_cgroup_subsys, nextid, + &root->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + iter = container_of(css, struct mem_cgroup, css); rcu_read_unlock(); - - if (mem) { - ret = (*func)(mem, data); - css_put(&mem->css); - } + /* If css is NULL, no more cgroups will be found */ nextid = found + 1; - } while (!ret && css); + } while (css && !iter); - return ret; + return iter; } +/* + * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please + * be careful that "break" loop is not allowed. We have reference count. + * Instead of that modify "cond" to be false and "continue" to exit the loop. + */ +#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ + for (iter = mem_cgroup_start_loop(root);\ + iter != NULL;\ + iter = mem_cgroup_get_next(iter, root, cond)) + +#define for_each_mem_cgroup_tree(iter, root) \ + for_each_mem_cgroup_tree_cond(iter, root, true) + +#define for_each_mem_cgroup_all(iter) \ + for_each_mem_cgroup_tree_cond(iter, NULL, true) + static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) { @@ -730,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) @@ -752,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -773,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) += 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) return; @@ -946,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return NULL; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; @@ -1001,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ @@ -1035,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1051,7 +1146,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } -/* A routine for testing mem is not under move_account */ +static void mem_cgroup_start_move(struct mem_cgroup *mem) +{ + int cpu; + + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&mem->pcp_counter_lock); + put_online_cpus(); + + synchronize_rcu(); +} + +static void mem_cgroup_end_move(struct mem_cgroup *mem) +{ + int cpu; + + if (!mem) + return; + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&mem->pcp_counter_lock); + put_online_cpus(); +} +/* + * 2 routines for checking "mem" is under move_account() or not. + * + * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used + * for avoiding race in accounting. If true, + * pc->mem_cgroup may be overwritten. + * + * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or + * under hierarchy of moving cgroups. This is for + * waiting at hith-memory prressure caused by "move". + */ + +static bool mem_cgroup_stealed(struct mem_cgroup *mem) +{ + VM_BUG_ON(!rcu_read_lock_held()); + return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +} static bool mem_cgroup_under_move(struct mem_cgroup *mem) { @@ -1092,13 +1232,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) return false; } -static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) -{ - int *val = data; - (*val)++; - return 0; -} - /** * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. * @memcg: The memory cgroup that went over limit @@ -1173,7 +1306,10 @@ done: static int mem_cgroup_count_children(struct mem_cgroup *mem) { int num = 0; - mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + num++; return num; } @@ -1185,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available @@ -1322,49 +1459,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return total; } -static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) -{ - int *val = (int *)data; - int x; - /* - * Logically, we can stop scanning immediately when we find - * a memcg is already locked. But condidering unlock ops and - * creation/removal of memcg, scan-all is simple operation. - */ - x = atomic_inc_return(&mem->oom_lock); - *val = max(x, *val); - return 0; -} /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int lock_count = 0; + int x, lock_count = 0; + struct mem_cgroup *iter; - mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); + for_each_mem_cgroup_tree(iter, mem) { + x = atomic_inc_return(&iter->oom_lock); + lock_count = max(x, lock_count); + } if (lock_count == 1) return true; return false; } -static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ - atomic_add_unless(&mem->oom_lock, -1, 0); + for_each_mem_cgroup_tree(iter, mem) + atomic_add_unless(&iter->oom_lock, -1, 0); return 0; } -static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) -{ - mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); -} static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1462,35 +1589,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) /* * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. + * + * Notes: Race condition + * + * We usually use page_cgroup_lock() for accessing page_cgroup member but + * it tends to be costly. But considering some conditions, we doesn't need + * to do so _always_. + * + * Considering "charge", lock_page_cgroup() is not required because all + * file-stat operations happen after a page is attached to radix-tree. There + * are no race with "charge". + * + * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup + * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even + * if there are race with "uncharge". Statistics itself is properly handled + * by flags. + * + * Considering "move", this is an only case we see a race. To make the race + * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are + * possibility of race condition. If there is, we take a lock. */ -void mem_cgroup_update_file_mapped(struct page *page, int val) + +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; - struct page_cgroup *pc; + struct page_cgroup *pc = lookup_page_cgroup(page); + bool need_unlock = false; + unsigned long uninitialized_var(flags); - pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - lock_page_cgroup(pc); + rcu_read_lock(); mem = pc->mem_cgroup; - if (!mem || !PageCgroupUsed(pc)) - goto done; + if (unlikely(!mem || !PageCgroupUsed(pc))) + goto out; + /* pc->mem_cgroup is unstable ? */ + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { + /* take a lock against to access pc->mem_cgroup */ + move_lock_page_cgroup(pc, &flags); + need_unlock = true; + mem = pc->mem_cgroup; + if (!mem || !PageCgroupUsed(pc)) + goto out; + } - /* - * Preemption is already disabled. We can use __this_cpu_xxx - */ - if (val > 0) { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - SetPageCgroupFileMapped(pc); - } else { - __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - ClearPageCgroupFileMapped(pc); + switch (idx) { + case MEMCG_NR_FILE_MAPPED: + if (val > 0) + SetPageCgroupFileMapped(pc); + else if (!page_mapped(page)) + ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; + break; + default: + BUG(); } -done: - unlock_page_cgroup(pc); + this_cpu_add(mem->stat->count[idx], val); + +out: + if (unlikely(need_unlock)) + move_unlock_page_cgroup(pc, &flags); + rcu_read_unlock(); + return; } +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -1605,15 +1770,55 @@ static void drain_all_stock_sync(void) atomic_dec(&memcg_drain_count); } -static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, +/* + * This function drains percpu counter value from DEAD cpu and + * move it to local cpu. Note that this function can be preempted. + */ +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +{ + int i; + + spin_lock(&mem->pcp_counter_lock); + for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { + s64 x = per_cpu(mem->stat->count[i], cpu); + + per_cpu(mem->stat->count[i], cpu) = 0; + mem->nocpu_base.count[i] += x; + } + /* need to clear ON_MOVE value, works as a kind of lock. */ + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; + spin_unlock(&mem->pcp_counter_lock); +} + +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +{ + int idx = MEM_CGROUP_ON_MOVE; + + spin_lock(&mem->pcp_counter_lock); + per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +} + +static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; + struct mem_cgroup *iter; - if (action != CPU_DEAD) + if ((action == CPU_ONLINE)) { + for_each_mem_cgroup_all(iter) + synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; + } + + if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + for_each_mem_cgroup_all(iter) + mem_cgroup_drain_pcp_counter(iter, cpu); + stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; @@ -1646,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* @@ -1691,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1721,7 +1940,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1729,23 +1948,22 @@ again: rcu_read_lock(); p = rcu_dereference(mm->owner); - VM_BUG_ON(!p); /* - * because we don't have task_lock(), "p" can exit while - * we're here. In that case, "mem" can point to root - * cgroup but never be NULL. (and task_struct itself is freed - * by RCU, cgroup itself is RCU safe.) Then, we have small - * risk here to get wrong cgroup. But such kind of mis-account - * by race always happens because we don't have cgroup_mutex(). - * It's overkill and we allow that small race, here. + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. */ mem = mem_cgroup_from_task(p); - VM_BUG_ON(!mem); - if (mem_cgroup_is_root(mem)) { + if (!mem || mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1786,7 +2004,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -1807,8 +2025,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -1836,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -1888,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ - static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { + int nr_pages = page_size >> PAGE_SHIFT; + /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) return; @@ -1904,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } - + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -1931,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, pc, true); - + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. @@ -1942,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, memcg_check_events(mem, pc->page); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + /* + * We have no races with charge/uncharge but will have races with + * page state accounting. + */ + move_lock_page_cgroup(head_pc, &flags); + + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + if (PageCgroupAcctLRU(head_pc)) { + enum lru_list lru; + struct mem_cgroup_per_zone *mz; + + /* + * LRU flags cannot be copied because we need to add tail + *.page to LRU by generic call and our hook will be called. + * We hold lru_lock, then, reduce counter directly. + */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc); + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + } + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); +} +#endif + /** * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. @@ -1960,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, */ static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, + int charge_size) { + int nr_pages = charge_size >> PAGE_SHIFT; + VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!page_is_cgroup_locked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); @@ -1975,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, pc, false); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, charge_size); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, pc, true); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -1997,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * __mem_cgroup_move_account() */ static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, + bool uncharge, int charge_size) { int ret = -EINVAL; + unsigned long flags; + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) + return -EBUSY; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - __mem_cgroup_move_account(pc, from, to, uncharge); + move_lock_page_cgroup(pc, &flags); + __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); @@ -2026,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; + int page_size = PAGE_SIZE; + unsigned long flags; int ret; /* Is ROOT ? */ @@ -2038,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; + parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent, true); + if (page_size > PAGE_SIZE) + flags = compound_lock_irqsave(page); + + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, page_size); + + if (page_size > PAGE_SIZE) + compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); put: @@ -2064,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; + } + pc = lookup_page_cgroup(page); /* can happen at boot */ if (unlikely(!pc)) return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2086,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2193,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2215,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2264,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2295,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2308,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2322,8 +2625,10 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2331,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } + + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2363,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); ClearPageCgroupUsed(pc); /* @@ -2384,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2579,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2628,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2655,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2670,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { @@ -3038,6 +3349,7 @@ move_account: lru_add_drain_all(); drain_all_stock_sync(); ret = 0; + mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; @@ -3051,6 +3363,7 @@ move_account: if (ret) break; } + mem_cgroup_end_move(mem); memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) @@ -3137,33 +3450,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } -struct mem_cgroup_idx_data { - s64 val; - enum mem_cgroup_stat_index idx; -}; -static int -mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) +static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { - struct mem_cgroup_idx_data *d = data; - d->val += mem_cgroup_read_stat(mem, d->idx); - return 0; -} + struct mem_cgroup *iter; + s64 val = 0; -static void -mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx, s64 *val) -{ - struct mem_cgroup_idx_data d; - d.idx = idx; - d.val = 0; - mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); - *val = d.val; + /* each per cpu's value can be minus.Then, use s64 */ + for_each_mem_cgroup_tree(iter, mem) + val += mem_cgroup_read_stat(iter, idx); + + if (val < 0) /* race ? */ + val = 0; + return val; } static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) { - u64 idx_val, val; + u64 val; if (!mem_cgroup_is_root(mem)) { if (!swap) @@ -3172,16 +3477,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; + val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); - if (swap) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT, &idx_val); - val += idx_val; - } + if (swap) + val += mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } @@ -3389,9 +3690,9 @@ struct { }; -static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +static void +mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - struct mcs_total_stat *s = data; s64 val; /* per cpu stat */ @@ -3421,13 +3722,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; - return 0; } static void mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_get_local_stat(iter, s); } static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, @@ -3604,7 +3907,7 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) { struct mem_cgroup_eventfd_list *ev; @@ -3615,7 +3918,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) static void mem_cgroup_oom_notify(struct mem_cgroup *mem) { - mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_oom_notify_cb(iter); } static int mem_cgroup_usage_register_event(struct cgroup *cgrp, @@ -3986,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4016,23 +4320,25 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!mem->stat) { - if (size < PAGE_SIZE) - kfree(mem); - else - vfree(mem); - mem = NULL; - } + if (!mem->stat) + goto out_free; + spin_lock_init(&mem->pcp_counter_lock); return mem; + +out_free: + if (size < PAGE_SIZE) + kfree(mem); + else + vfree(mem); + return NULL; } /* @@ -4158,7 +4464,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) &per_cpu(memcg_stock, cpu); INIT_WORK(&stock->work, drain_local_stock); } - hotcpu_notifier(memcg_stock_cpu_callback, 0); + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -4268,7 +4574,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4430,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4467,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4505,17 +4818,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); + mem_cgroup_end_move(from); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4542,15 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) @@ -4579,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); @@ -4599,7 +4921,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false)) { + mc.from, mc.to, false, PAGE_SIZE)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4644,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4723,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = { }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!(*s) || !strcmp(s, "=1")) + really_do_swap_account = 1; + else if (!strcmp(s, "=0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0accfe..0207c2f6f8bd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -47,6 +51,7 @@ #include <linux/slab.h> #include <linux/swapops.h> #include <linux/hugetlb.h> +#include <linux/memory_hotplug.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -78,7 +83,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; @@ -198,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -228,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. */ if (access) { int nr; @@ -268,7 +273,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -309,7 +314,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -577,7 +582,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -693,11 +698,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * Issues: * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. - * - To support soft-offlining for hugepage, we need to support hugepage - * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { + int res = 0; struct page *hpage = compound_head(p); /* * We can safely recover from error on free or reserved (i.e. @@ -710,8 +714,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { - __isolate_hwpoisoned_huge_page(hpage); - return RECOVERED; + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; } return DELAYED; } @@ -836,8 +841,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -849,9 +852,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -893,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. + */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. + */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -901,22 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(hpage, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } + if (hpage != ppage) + lock_page_nosync(ppage); + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -927,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -936,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -944,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -974,14 +1011,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -993,6 +1033,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -1007,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* @@ -1049,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); @@ -1147,16 +1208,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1168,12 +1239,12 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); } - if (PageHuge(p)) - clear_page_hwpoison_huge_page(page); unlock_page(page); put_page(page); @@ -1187,7 +1258,11 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1204,25 +1279,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_system_sleep prevents a race with memory hotplug, - * because the isolation assumes there's only a single user. + * The lock_memory_hotplug prevents a race with memory hotplug. * This is a big hammer, a better would be nicer. */ - lock_system_sleep(); + lock_memory_hotplug(); /* * Isolate the page, so that it doesn't get reallocated if it * was free. */ set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ if (!get_page_unless_zero(compound_head(p))) { - if (is_free_buddy_page(p)) { - pr_debug("get_any_page: %#lx free buddy page\n", pfn); + if (PageHuge(p)) { + pr_info("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { + pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } @@ -1231,7 +1312,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ret = 1; } unset_migratetype_isolate(p); - unlock_system_sleep(); + unlock_memory_hotplug(); + return ret; +} + +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); + if (ret) { + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ return ret; } @@ -1262,6 +1387,9 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; @@ -1288,7 +1416,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } @@ -1302,7 +1430,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); - pr_debug("soft offline: %#lx page already poisoned\n", pfn); + pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1323,7 +1451,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; - pr_debug("soft_offline: %#lx: invalidated\n", pfn); + pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } @@ -1337,15 +1465,17 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + putback_lru_pages(&pagelist); + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) diff --git a/mm/memory.c b/mm/memory.c index 0e18b4d649ec..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -719,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -736,7 +743,7 @@ again: dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; @@ -767,7 +774,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(orig_src_pte); + pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); + split_huge_page_pmd(vma->vm_mm, pmd); + } else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; @@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: @@ -1341,7 +1412,8 @@ no_page_table: int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); @@ -1441,16 +1514,22 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| + VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -1459,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else tsk->min_flt++; + if (ret & VM_FAULT_RETRY) { + *nonblocking = 0; + return i; + } + /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, @@ -1558,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1583,22 +1668,25 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; } #endif /* CONFIG_ELF_CORE */ -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1817,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -2047,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* @@ -2079,7 +2155,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else @@ -2107,10 +2183,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2142,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2210,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2218,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2254,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2331,42 +2432,19 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { + if (old_page) { /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); + page_cache_release(old_page); } return ret; oom_free_new: @@ -2570,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2586,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); @@ -2626,6 +2706,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; int exclusive = 0; int ret = 0; @@ -2676,8 +2757,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not @@ -2926,7 +3011,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; if (unlikely(PageHWPoison(vmf.page))) { @@ -2967,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3139,9 +3219,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3185,7 +3265,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * with threads. */ if (flags & FAULT_FLAG_WRITE) - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } unlock: pte_unmap_unlock(pte, ptl); @@ -3220,9 +3300,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } @@ -3288,7 +3399,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; @@ -3343,7 +3459,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3360,6 +3476,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; @@ -3380,6 +3497,17 @@ out: return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping @@ -3589,3 +3717,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dd186c1a5d53..321fc7455df7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,23 @@ #include "internal.h" +DEFINE_MUTEX(mem_hotplug_mutex); + +void lock_memory_hotplug(void) +{ + mutex_lock(&mem_hotplug_mutex); + + /* for exclusive hibernation if CONFIG_HIBERNATION=y */ + lock_system_sleep(); +} + +void unlock_memory_hotplug(void) +{ + unlock_system_sleep(); + mutex_unlock(&mem_hotplug_mutex); +} + + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } @@ -390,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int ret; struct memory_notify arg; + lock_memory_hotplug(); arg.start_pfn = pfn; arg.nr_pages = nr_pages; arg.status_change_nid = -1; @@ -402,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } /* @@ -426,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } @@ -450,6 +472,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + unlock_memory_hotplug(); return 0; } @@ -493,7 +516,7 @@ int mem_online_node(int nid) pg_data_t *pgdat; int ret; - lock_system_sleep(); + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (pgdat) { ret = -ENOMEM; @@ -504,7 +527,7 @@ int mem_online_node(int nid) BUG_ON(ret); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } @@ -516,7 +539,7 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - lock_system_sleep(); + lock_memory_hotplug(); res = register_memory_resource(start, size); ret = -EEXIST; @@ -563,7 +586,7 @@ error: release_memory_resource(res); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -602,27 +625,14 @@ static struct page *next_active_pageblock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ @@ -659,7 +669,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -int scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -709,29 +719,31 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { - /* Becasue we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) - not_managed++; #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); dump_page(page); #endif + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + ret = -EBUSY; + break; + } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + true, true); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); - out: return ret; } @@ -803,7 +815,7 @@ static int offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_system_sleep(); + lock_memory_hotplug(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -840,7 +852,6 @@ repeat: ret = 0; if (drain) { lru_add_drain_all(); - flush_scheduled_work(); cond_resched(); drain_all_pages(); } @@ -862,7 +873,6 @@ repeat: } /* drain all zone's lru pagevec, this is asyncronous... */ lru_add_drain_all(); - flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. */ drain_all_pages(); @@ -894,7 +904,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_system_sleep(); + unlock_memory_hotplug(); return 0; failed_removal: @@ -905,7 +915,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5dd8a2..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, @@ -924,15 +925,22 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); - if (!list_empty(&pagelist)) - err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -1147,9 +1155,13 @@ static long do_mbind(unsigned long start, unsigned long len, err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, + false, true); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; @@ -1298,15 +1310,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -EINVAL; if (!mm) @@ -1588,7 +1600,7 @@ unsigned slab_node(struct mempolicy *policy) (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone->node; + return zone ? zone->node : numa_node_id(); } default: @@ -1784,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1793,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1806,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1818,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1827,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1836,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad782f4..766115253807 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -32,8 +32,11 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/hugetlb.h> #include <linux/gfp.h> +#include <asm/tlbflush.h> + #include "internal.h" #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -95,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte_t *ptep, pte; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; + if (unlikely(PageHuge(new))) { + ptep = huge_pte_offset(mm, addr); + if (!ptep) + goto out; + ptl = &mm->page_table_lock; + } else { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; + if (!pmd_present(*pmd)) + goto out; - ptep = pte_offset_map(pmd, addr); + ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } + + ptl = pte_lockptr(mm, pmd); + } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) @@ -130,10 +143,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(new)) + pte = pte_mkhuge(pte); +#endif flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); - if (PageAnon(new)) + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, addr); + else + page_dup_rmap(new); + } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr); else page_add_file_rmap(new); @@ -226,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -276,11 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * The expected number of remaining references is the same as that + * of migrate_page_move_mapping(). + */ +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int expected_count; + void **pslot; + + if (!mapping) { + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + get_page(newpage); + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + + __put_page(page); + + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* * Copy the page to its new location */ -static void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_copy(struct page *newpage, struct page *page) { - copy_highpage(newpage, page); + if (PageHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); if (PageError(page)) SetPageError(newpage); @@ -431,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; @@ -547,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; struct anon_vma *anon_vma = NULL; @@ -565,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -572,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } @@ -598,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -633,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -656,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. - */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -679,20 +761,18 @@ skip_unmap: if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) drop_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -706,8 +786,6 @@ unlock: putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. @@ -724,6 +802,81 @@ move_newpage: } /* + * Counterpart of unmap_and_move_page() for hugepage migration. + * + * This function doesn't wait the completion of hugepage I/O + * because there is no race between I/O and migration for hugepage. + * Note that currently hugepage I/O occurs only in direct I/O + * where no lock is held and PG_writeback is irrelevant, + * and writeback status of all subpages are counted in the reference + * count of the head page (i.e. if all subpages of a 2MB hugepage are + * under direct I/O, the reference of the head page is 512 and a bit more.) + * This means that when we try to migrate hugepage whose subpages are + * doing direct I/O, some references remain after try_to_unmap() and + * hugepage migration fails without data corruption. + * + * There is also no race when direct I/O is issued on the page under migration, + * because then pte is replaced with migration swap entry and direct I/O code + * will wait in the page fault for migration to complete. + */ +static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, + int force, bool offlining, bool sync) +{ + int rc = 0; + int *result = NULL; + struct page *new_hpage = get_new_page(hpage, private, &result); + struct anon_vma *anon_vma = NULL; + + if (!new_hpage) + return -ENOMEM; + + rc = -EAGAIN; + + if (!trylock_page(hpage)) { + if (!force || !sync) + goto out; + lock_page(hpage); + } + + if (PageAnon(hpage)) { + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } + } + + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1); + + if (rc) + remove_migration_ptes(hpage, hpage); + + if (anon_vma) + drop_anon_vma(anon_vma); +out: + unlock_page(hpage); + + if (rc != -EAGAIN) { + list_del(&hpage->lru); + put_page(hpage); + } + + put_page(new_hpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(new_hpage); + } + return rc; +} + +/* * migrate_pages * * The function takes one list of pages to migrate and a function @@ -732,13 +885,15 @@ move_newpage: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -758,7 +913,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -780,8 +936,50 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); + if (rc) + return rc; + + return nr_failed + retry; +} + +int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int rc; + + for (pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining, + sync); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: if (rc) return rc; @@ -841,10 +1039,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) @@ -890,9 +1088,12 @@ set_status: } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; @@ -1005,7 +1206,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b6..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f72..c3924c7f00be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -170,73 +169,33 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + gup_flags = FOLL_TOUCH; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + if (vma->vm_flags & VM_LOCKED) + gup_flags |= FOLL_MLOCK; + /* We don't try to access the guard page of a stack vma */ if (stack_guard_page(vma, start)) { addr += PAGE_SIZE; nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -280,7 +239,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -372,18 +331,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -419,14 +370,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -439,7 +386,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -482,6 +430,62 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + ret = __mlock_posix_error_return(ret); + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -507,6 +511,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -571,6 +577,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index 00161a48a451..2ec8eb5a9cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,8 @@ #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/perf_event.h> +#include <linux/audit.h> +#include <linux/khugepaged.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -252,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif @@ -587,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma @@ -814,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -832,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1108,6 +1122,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long retval = -EBADF; if (!(flags & MAP_ANONYMOUS)) { + audit_mmap_fd(fd, flags); if (unlikely(flags & MAP_HUGETLB)) return -EINVAL; file = fget(fd); @@ -1759,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1806,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } @@ -2460,6 +2477,7 @@ int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { + int ret; struct vm_area_struct *vma; vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); @@ -2477,16 +2495,23 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - if (unlikely(insert_vm_struct(mm, vma))) { - kmem_cache_free(vm_area_cachep, vma); - return -ENOMEM; - } + ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (ret) + goto out; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; mm->total_vm += len >> PAGE_SHIFT; perf_event_mmap(vma); return 0; + +out: + kmem_cache_free(vm_area_cachep, vma); + return ret; } static DEFINE_MUTEX(mm_all_locks_mutex); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 2d1bf7cf8851..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } @@ -211,6 +221,7 @@ success: mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); + perf_event_mmap(vma); return 0; fail: @@ -299,7 +310,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/mremap.c b/mm/mremap.c index cde56ee51ef7..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -91,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* @@ -101,7 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -147,7 +148,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; diff --git a/mm/nommu.c b/mm/nommu.c index 88ff091eb07a..f59e1424d3db 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -29,6 +29,7 @@ #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -126,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -184,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -293,12 +296,60 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -392,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { @@ -1411,6 +1487,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct file *file = NULL; unsigned long retval = -EBADF; + audit_mmap_fd(fd, flags); if (!(flags & MAP_ANONYMOUS)) { file = fget(fd); if (!file) @@ -1668,6 +1745,7 @@ void exit_mmap(struct mm_struct *mm) mm->mmap = vma->vm_next; delete_vma_from_mm(vma); delete_vma(mm, vma); + cond_resched(); } kleave(""); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4029583a1024..7dcca55ede7c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, return 0; /* - * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't - * need to be executed for something that cannot be killed. + * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } @@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, #define K(x) ((x) << (PAGE_SHIFT-10)) static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { + struct task_struct *q; + struct mm_struct *mm; + p = find_lock_task_mm(p); if (!p) return 1; + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), K(get_mm_counter(p->mm, MM_ANONPAGES)), K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); + /* + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. + */ + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); + force_sig(SIGKILL, q); + } set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); @@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - (current->signal->oom_adj != OOM_DISABLE)) { + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e3bccac1f025..2cb01f6ec5d0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -404,25 +404,22 @@ unsigned long determine_dirtyable_memory(void) * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * runtime tasks. + * real-time tasks. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -510,7 +507,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; @@ -542,8 +539,8 @@ static void balance_dirty_pages(struct address_space *mapping, * the last resort safeguard. */ dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) - || (nr_reclaimable + nr_writeback >= dirty_thresh); + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; @@ -569,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping, break; /* We've done our duty */ } trace_wbc_balance_dirty_wait(&wbc, bdi); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); /* @@ -1109,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } @@ -1121,6 +1118,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); @@ -1129,6 +1127,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. + */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); +} +EXPORT_SYMBOL(account_page_writeback); + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * @@ -1366,7 +1376,7 @@ int test_set_page_writeback(struct page *page) ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f12ad1836abe..a873e61e312e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> @@ -103,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; * only be modified with pm_mutex held, unless the suspend/hibernate code is * guaranteed not to run in parallel with that modification). */ -void set_gfp_allowed_mask(gfp_t mask) + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask = mask; + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } } -gfp_t clear_gfp_allowed_mask(gfp_t mask) +void pm_restrict_gfp_mask(void) { - gfp_t ret = gfp_allowed_mask; - WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask &= ~mask; - return ret; + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; } #endif /* CONFIG_PM_SLEEP */ @@ -351,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -420,18 +427,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -442,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -476,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -493,6 +492,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -507,7 +507,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -515,7 +516,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -530,11 +531,12 @@ static inline void __free_one_page(struct page *page, * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -645,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; @@ -1089,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } @@ -1454,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1480,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -1787,15 +1807,18 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; if (!order || compaction_deferred(preferred_zone)) return NULL; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1831,7 +1854,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -1846,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1906,7 +1929,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1914,24 +1937,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1939,21 +1962,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -1972,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -1997,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2006,6 +2036,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2028,21 +2066,26 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2094,15 +2137,29 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress, + sync_migration); + if (page) + goto got_pg; } nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + current->comm, order, gfp_mask); dump_stack(); show_mem(); } @@ -2145,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; @@ -2436,7 +2495,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2579,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); @@ -3007,14 +3073,6 @@ static __init_refok int __build_all_zonelists(void *data) build_zonelist_cache(pgdat); } -#ifdef CONFIG_MEMORY_HOTPLUG - /* Setup real pagesets for the new zone */ - if (data) { - struct zone *zone = data; - setup_zone_pageset(zone); - } -#endif - /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -3063,7 +3121,11 @@ void build_all_zonelists(void *data) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, data, NULL); +#ifdef CONFIG_MEMORY_HOTPLUG + if (data) + setup_zone_pageset((struct zone *)data); +#endif + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -3636,6 +3698,41 @@ void __init free_bootmem_with_active_regions(int nid, } } +#ifdef CONFIG_HAVE_MEMBLOCK +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + + /* Need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + u64 final_start, final_end; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + + final_start = max(ei_start, goal); + final_end = min(ei_last, limit); + + if (final_start >= final_end) + continue; + + addr = memblock_find_in_range(final_start, final_end, size, align); + + if (addr == MEMBLOCK_ERROR) + continue; + + return addr; + } + + return MEMBLOCK_ERROR; +} +#endif + int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { @@ -3655,46 +3752,26 @@ int __init add_from_early_node_map(struct range *range, int az, void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { - int i; void *ptr; + u64 addr; - if (limit > get_max_mapped()) - limit = get_max_mapped(); + if (limit > memblock.current_limit) + limit = memblock.current_limit; - /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - u64 addr; - u64 ei_start, ei_last; + addr = find_memory_core_early(nid, size, align, goal, limit); - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - addr = find_early_area(ei_start, ei_last, - goal, limit, size, align); - - if (addr == -1ULL) - continue; - -#if 0 - printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", - nid, - ei_start, ei_last, goal, limit, size, - align, addr); -#endif - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - reserve_early_without_check(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; - } + if (addr == MEMBLOCK_ERROR) + return NULL; - return NULL; + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; } #endif @@ -3997,7 +4074,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); } #else -static void inline setup_usemap(struct pglist_data *pgdat, +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ @@ -5281,12 +5358,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. */ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - struct page *curr_page; - unsigned long flags, pfn, iter; - unsigned long immobile = 0; + unsigned long flags, pfn; struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; @@ -5296,11 +5426,6 @@ int set_migratetype_isolate(struct page *page) zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || - zone_idx == ZONE_MOVABLE) { - ret = 0; - goto out; - } pfn = page_to_pfn(page); arg.start_pfn = pfn; @@ -5320,23 +5445,20 @@ int set_migratetype_isolate(struct page *page) */ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret || !arg.pages_found) + if (notifier_ret) goto out; - - for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { - if (!pfn_valid_within(pfn)) - continue; - - curr_page = pfn_to_page(iter); - if (!page_count(curr_page) || PageLRU(curr_page)) - continue; - - immobile++; - } - - if (arg.pages_found == immobile) + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) ret = 0; + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + out: if (!ret) { set_pageblock_migratetype(page, MIGRATE_ISOLATE); @@ -5455,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU @@ -5503,7 +5624,7 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd967452..4ae42bb40892 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8b1a2ce21ee5..7cfa6ae02303 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(walk->mm, pmd); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); @@ -139,7 +140,6 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; - struct vm_area_struct *vma; if (addr >= end) return err; @@ -149,15 +149,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { + struct vm_area_struct *uninitialized_var(vma); + next = pgd_addr_end(addr, end); +#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); -#ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; diff --git a/mm/percpu-km.c b/mm/percpu-km.c index df680855540a..89633fefc6a2 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -27,7 +27,7 @@ * chunk size is not aligned. percpu-km code will whine about it. */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #error "contiguous percpu allocation is incompatible with paged first chunk" #endif @@ -35,7 +35,11 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - /* noop */ + unsigned int cpu; + + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + return 0; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index c76ef3891e0d..3f930018aa60 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -31,7 +31,7 @@ * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous + * guaranteed to be equal to or larger than the maximum contiguous * area in the chunk. This helps the allocator not to iterate the * chunk maps unnecessarily. * @@ -76,6 +76,7 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ @@ -89,6 +90,11 @@ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif +#else /* CONFIG_SMP */ +/* on UP, it's always identity mapped */ +#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) +#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) +#endif /* CONFIG_SMP */ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ @@ -252,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, /* * (Un)populated page region iterators. Iterate over (un)populated - * page regions betwen @start and @end in @chunk. @rs and @re should + * page regions between @start and @end in @chunk. @rs and @re should * be integer variables and will be set to start and end page index of * the current region. */ @@ -287,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); - else { - void *ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; - } + else + return vzalloc(size); } /** @@ -820,8 +822,8 @@ fail_unlock_mutex: * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align. + * Might sleep. Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -840,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align from reserved - * percpu area if arch has set it up; otherwise, allocation is served - * from the same dynamic area. Might sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align + * from reserved percpu area if arch has set it up; otherwise, + * allocation is served from the same dynamic area. Might sleep. + * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -949,6 +952,7 @@ EXPORT_SYMBOL_GPL(free_percpu); */ bool is_kernel_percpu_address(unsigned long addr) { +#ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; @@ -959,6 +963,8 @@ bool is_kernel_percpu_address(unsigned long addr) if ((void *)addr >= start && (void *)addr < start + static_size) return true; } +#endif + /* on UP, can't distinguish from other static vars, always false */ return false; } @@ -1067,161 +1073,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) } /** - * pcpu_build_alloc_info - build alloc_info considering distances between CPUs - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: minimum free size for dynamic allocation in bytes - * @atom_size: allocation atom size - * @cpu_distance_fn: callback to determine distance between cpus, optional - * - * This function determines grouping of units, their mappings to cpus - * and other parameters considering needed percpu size, allocation - * atom size and distances between CPUs. - * - * Groups are always mutliples of atom size and CPUs which are of - * LOCAL_DISTANCE both ways are grouped together and share space for - * units in the same group. The returned configuration is guaranteed - * to have CPUs on different nodes on different groups and >=75% usage - * of allocated virtual address space. - * - * RETURNS: - * On success, pointer to the new allocation_info is returned. On - * failure, ERR_PTR value is returned. - */ -static struct pcpu_alloc_info * __init pcpu_build_alloc_info( - size_t reserved_size, size_t dyn_size, - size_t atom_size, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - static int group_map[NR_CPUS] __initdata; - static int group_cnt[NR_CPUS] __initdata; - const size_t static_size = __per_cpu_end - __per_cpu_start; - int nr_groups = 1, nr_units = 0; - size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs, group, unit; - unsigned int cpu, tcpu; - struct pcpu_alloc_info *ai; - unsigned int *cpu_map; - - /* this function may be called multiple times */ - memset(group_map, 0, sizeof(group_map)); - memset(group_cnt, 0, sizeof(group_cnt)); - - /* calculate size_sum and ensure dyn_size is enough for early alloc */ - size_sum = PFN_ALIGN(static_size + reserved_size + - max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); - dyn_size = size_sum - static_size - reserved_size; - - /* - * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to - * or larger than min_unit_size. - */ - min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - - alloc_size = roundup(min_unit_size, atom_size); - upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - upa--; - max_upa = upa; - - /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && cpu_distance_fn && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - nr_groups = max(nr_groups, group + 1); - goto next_group; - } - } - group_map[cpu] = group; - group_cnt[group]++; - } - - /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. - */ - last_allocs = INT_MAX; - for (upa = max_upa; upa; upa--) { - int allocs = 0, wasted = 0; - - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - continue; - - for (group = 0; group < nr_groups; group++) { - int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); - allocs += this_allocs; - wasted += this_allocs * upa - group_cnt[group]; - } - - /* - * Don't accept if wastage is over 1/3. The - * greater-than comparison ensures upa==1 always - * passes the following check. - */ - if (wasted > num_possible_cpus() / 3) - continue; - - /* and then don't consume more memory */ - if (allocs > last_allocs) - break; - last_allocs = allocs; - best_upa = upa; - } - upa = best_upa; - - /* allocate and fill alloc_info */ - for (group = 0; group < nr_groups; group++) - nr_units += roundup(group_cnt[group], upa); - - ai = pcpu_alloc_alloc_info(nr_groups, nr_units); - if (!ai) - return ERR_PTR(-ENOMEM); - cpu_map = ai->groups[0].cpu_map; - - for (group = 0; group < nr_groups; group++) { - ai->groups[group].cpu_map = cpu_map; - cpu_map += roundup(group_cnt[group], upa); - } - - ai->static_size = static_size; - ai->reserved_size = reserved_size; - ai->dyn_size = dyn_size; - ai->unit_size = alloc_size / upa; - ai->atom_size = atom_size; - ai->alloc_size = alloc_size; - - for (group = 0, unit = 0; group_cnt[group]; group++) { - struct pcpu_group_info *gi = &ai->groups[group]; - - /* - * Initialize base_offset as if all groups are located - * back-to-back. The caller should update this to - * reflect actual allocation. - */ - gi->base_offset = unit * ai->unit_size; - - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - gi->cpu_map[gi->nr_units++] = cpu; - gi->nr_units = roundup(gi->nr_units, upa); - unit += gi->nr_units; - } - BUG_ON(unit != nr_units); - - return ai; -} - -/** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump @@ -1363,7 +1214,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); +#ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); +#endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); @@ -1411,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON - pcpu_dump_alloc_info(KERN_INFO, ai); + pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; @@ -1488,6 +1341,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, return 0; } +#ifdef CONFIG_SMP + const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", @@ -1515,8 +1370,180 @@ static int __init percpu_alloc_setup(char *str) } early_param("percpu_alloc", percpu_alloc_setup); +/* + * pcpu_embed_first_chunk() is used by the generic percpu setup. + * Build it if needed by the arch config or the generic setup is going + * to be used. + */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) +#define BUILD_EMBED_FIRST_CHUNK +#endif + +/* build pcpu_page_first_chunk() iff needed by the arch config */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#define BUILD_PAGE_FIRST_CHUNK +#endif + +/* pcpu_build_alloc_info() is used by both embed and page first chunk */ +#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +static struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 1/3. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} +#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ + +#if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes @@ -1645,10 +1672,9 @@ out_free: free_bootmem(__pa(areas), areas_size); return rc; } -#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || - !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* BUILD_EMBED_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#ifdef BUILD_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes @@ -1756,10 +1782,11 @@ out_free_ar: pcpu_free_alloc_info(ai); return rc; } -#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ +#endif /* BUILD_PAGE_FIRST_CHUNK */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* - * Generic percpu area setup. + * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is @@ -1770,7 +1797,6 @@ out_free_ar: * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -1799,13 +1825,48 @@ void __init setup_per_cpu_areas(void) PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) - panic("Failed to initialized percpu areas."); + panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#else /* CONFIG_SMP */ + +/* + * UP percpu area setup. + * + * UP always uses km-based percpu allocator with identity mapping. + * Static percpu variables are indistinguishable from the usual static + * variables and don't require any special preparation. + */ +void __init setup_per_cpu_areas(void) +{ + const size_t unit_size = + roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, + PERCPU_DYNAMIC_RESERVE)); + struct pcpu_alloc_info *ai; + void *fc; + + ai = pcpu_alloc_alloc_info(1, 1); + fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!ai || !fc) + panic("Failed to allocate memory for percpu areas."); + + ai->dyn_size = unit_size; + ai->unit_size = unit_size; + ai->atom_size = unit_size; + ai->alloc_size = unit_size; + ai->groups[0].nr_units = 1; + ai->groups[0].cpu_map[0] = 0; + + if (pcpu_setup_first_chunk(ai, fc) < 0) + panic("Failed to initialize percpu areas."); +} + +#endif /* CONFIG_SMP */ /* * First and reserved chunks are initialized with temporary allocation diff --git a/mm/percpu_up.c b/mm/percpu_up.c deleted file mode 100644 index db884fae5721..000000000000 --- a/mm/percpu_up.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * mm/percpu_up.c - dummy percpu memory allocator implementation for UP - */ - -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/slab.h> - -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return (void __percpu __force *)kzalloc(size, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -void free_percpu(void __percpu *p) -{ - kfree(this_cpu_ptr(p)); -} -EXPORT_SYMBOL_GPL(free_percpu); - -phys_addr_t per_cpu_ptr_to_phys(void *addr) -{ - return __pa(addr); -} diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000000..eb663fb533e0 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,121 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include <linux/pagemap.h> +#include <asm/tlb.h> +#include <asm-generic/pgtable.h> + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 92e6757f196e..f21f4a1d6a1c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void) return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); } -void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } @@ -94,7 +94,7 @@ void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, but if - * if not we either need to find an adjacent mapping that we + * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } @@ -314,7 +318,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; @@ -348,6 +352,8 @@ out: } void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { anon_vma_unlock(anon_vma); rcu_read_unlock(); @@ -358,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. */ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -407,7 +413,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; @@ -433,6 +439,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -487,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -524,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -745,7 +765,7 @@ int page_mkclean(struct page *page) if (mapping) { ret = page_mkclean_file(mapping, page); if (page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); ret = 1; } } @@ -780,10 +800,10 @@ void page_move_anon_rmap(struct page *page, } /** - * __page_set_anon_rmap - setup new anonymous rmap - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @address: the user virtual address mapped + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page to add to rmap + * @vma: VM area to add page to. + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, @@ -793,25 +813,16 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); + if (PageAnon(page)) + return; + /* * If the page isn't exclusively mapped into this vma, * we must use the _oldest_ possible anon_vma for the * page mapping! */ - if (!exclusive) { - if (PageAnon(page)) - return; + if (!exclusive) anon_vma = anon_vma->root; - } else { - /* - * In this case, swapped-out-but-not-discarded swap-cache - * is remapped. So, no need to update page->mapping here. - * We convice anon_vma poitned by page->mapping is not obsolete - * because vma->anon_vma is necessary to be a family of it. - */ - if (PageAnon(page)) - return; - } anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; @@ -871,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -900,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -918,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -942,7 +961,7 @@ void page_remove_rmap(struct page *page) * containing the swap entry, but page not yet written to swap. */ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); set_page_dirty(page); } /* @@ -953,10 +972,14 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, @@ -1209,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1407,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); diff --git a/mm/shmem.c b/mm/shmem.c index 080b09a57a8f..5ee67c990602 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; @@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); /* New dentry reference */ + ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: @@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, if (*len < 3) return 255; - if (hlist_unhashed(&inode->i_hash)) { + if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try @@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); @@ -2414,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) @@ -2537,16 +2545,16 @@ static const struct vm_operations_struct shmem_vm_ops = { }; -static int shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *shmem_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); + return mount_nodev(fs_type, flags, data, shmem_fill_super); } static struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .get_sb = shmem_get_sb, + .mount = shmem_mount, .kill_sb = kill_litter_super, }; @@ -2642,7 +2650,7 @@ out: static struct file_system_type tmpfs_fs_type = { .name = "tmpfs", - .get_sb = ramfs_get_sb, + .mount = ramfs_mount, .kill_sb = kill_litter_super, }; diff --git a/mm/slab.c b/mm/slab.c index fcae9815d3b3..37961d1f584f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -284,7 +284,7 @@ struct kmem_list3 { * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES #define SIZE_L3 (2 * MAX_NUMNODES) @@ -829,12 +829,12 @@ static void init_reap_node(int cpu) static void next_reap_node(void) { - int node = __get_cpu_var(slab_reap_node); + int node = __this_cpu_read(slab_reap_node); node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); - __get_cpu_var(slab_reap_node) = node; + __this_cpu_write(slab_reap_node, node); } #else @@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; @@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, */ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { - int node = __get_cpu_var(slab_reap_node); + int node = __this_cpu_read(slab_reap_node); if (l3->alien) { struct array_cache *ac = l3->alien[node]; @@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); /* Now the cache_reaper is guaranteed to be not running. */ per_cpu(slab_reap_work, cpu).work.func = NULL; break; @@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, /* * Map pages beginning at addr to the given cache and slab. This is required * for the slab allocator to be able to lookup the cache and slab of a - * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + * virtual address for kfree, ksize, and slab debugging. */ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, void *addr) @@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +void * +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); -} -EXPORT_SYMBOL(kmem_cache_alloc_notrace); -#endif + void *ret; -/** - * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. - * @cachep: the cache we're checking against - * @ptr: pointer to validate - * - * This verifies that the untrusted pointer looks sane; - * it is _not_ a guarantee that the pointer is actually - * part of the slab cache in question, but it at least - * validates that the pointer can be dereferenced and - * looks half-way sane. - * - * Currently only used for dentry validation. - */ -int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) -{ - unsigned long size = cachep->buffer_size; - struct page *page; + ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); - if (unlikely(!kern_ptr_validate(ptr, size))) - goto out; - page = virt_to_page(ptr); - if (unlikely(!PageSlab(page))) - goto out; - if (unlikely(page_get_cache(page) != cachep)) - goto out; - return 1; -out: - return 0; + trace_kmalloc(_RET_IP_, ret, + size, slab_buffer_size(cachep), flags); + return ret; } +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) @@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid) +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, + void *ret; + + ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); + trace_kmalloc_node(_RET_IP_, ret, + size, slab_buffer_size(cachep), + flags, nodeid); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; - void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node((unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) @@ -4075,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node) { int tofree; @@ -4339,7 +4317,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -ssize_t slabinfo_write(struct file *file, const char __user * buffer, +static ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; diff --git a/mm/slob.c b/mm/slob.c index d582171c8101..3588eaaef726 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { unsigned int order = get_order(size); - ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -676,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d) } EXPORT_SYMBOL(kmem_cache_shrink); -int kmem_ptr_validate(struct kmem_cache *a, const void *b) -{ - return 0; -} - static unsigned int slob_ready __read_mostly; int slab_is_available(void) diff --git a/mm/slub.c b/mm/slub.c index 13fffe1f0f3d..e15aa7f193c9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -28,6 +28,8 @@ #include <linux/math64.h> #include <linux/fault-inject.h> +#include <trace/events/kmem.h> + /* * Lock order: * 1. slab_lock(page) @@ -168,7 +170,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */ static int kmem_size = sizeof(struct kmem_cache); @@ -178,7 +179,7 @@ static struct notifier_block slab_notifier; static enum { DOWN, /* No slab functionality available */ - PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + PARTIAL, /* Kmem_cache_node works */ UP, /* Everything works but does not show up in sysfs */ SYSFS /* Sysfs up */ } slab_state = DOWN; @@ -199,7 +200,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); @@ -210,6 +211,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { + kfree(s->name); kfree(s); } @@ -233,11 +235,7 @@ int slab_is_available(void) static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { -#ifdef CONFIG_NUMA return s->node[node]; -#else - return &s->local_node; -#endif } /* Verify that a pointer has an address that is valid within a slab page */ @@ -494,7 +492,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) dump_stack(); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; @@ -504,9 +502,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); + memset(p + s->objsize, val, s->inuse - s->objsize); } static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) @@ -641,17 +637,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) } static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) + void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->objsize; if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, red, s->inuse - s->objsize)) + endobject, val, s->inuse - s->objsize)) return 0; } else { if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { @@ -661,7 +654,7 @@ static int check_object(struct kmem_cache *s, struct page *page, } if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->objsize - 1) || !check_bytes_and_report(s, page, p, "Poison", @@ -673,7 +666,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && active) + if (!s->offset && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -792,6 +785,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->objsize, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +{ + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} + +static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) +{ + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); +} + +/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache_node *n, struct page *page) @@ -838,7 +864,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) { + if (n) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -858,11 +884,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) return; - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -878,14 +904,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (!check_object(s, page, object, 0)) + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); - init_object(s, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: @@ -902,8 +928,8 @@ bad: return 0; } -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, unsigned long addr) +static noinline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -918,7 +944,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - if (!check_object(s, page, object, 1)) + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) return 0; if (unlikely(s != page->slab)) { @@ -942,7 +968,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); return 1; fail: @@ -1046,7 +1072,7 @@ static inline int free_debug_processing(struct kmem_cache *s, static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } + void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, @@ -1066,7 +1092,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#endif + +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) + { return 0; } + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + void *object) {} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + +static inline void slab_free_hook_irq(struct kmem_cache *s, + void *object) {} + +#endif /* CONFIG_SLUB_DEBUG */ /* * Slab allocation and freeing @@ -1194,7 +1232,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) - check_object(s, page, p, 0); + check_object(s, page, p, SLUB_RED_INACTIVE); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1274,13 +1312,19 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } +static inline void __remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + list_del(&page->lru); + n->nr_partial--; +} + static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); spin_unlock(&n->list_lock); } @@ -1293,8 +1337,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); __SetPageSlubFrozen(page); return 1; } @@ -1405,6 +1448,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * On exit the slab lock will have been dropped. */ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) + __releases(bitlock) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1447,6 +1491,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + __releases(bitlock) { struct page *page = c->page; int tail = 1; @@ -1647,6 +1692,7 @@ new_slab: goto load_freelist; } + gfpflags &= gfp_allowed_mask; if (gfpflags & __GFP_WAIT) local_irq_enable(); @@ -1674,7 +1720,7 @@ debug: c->page->inuse++; c->page->freelist = get_freepointer(s, object); - c->node = -1; + c->node = NUMA_NO_NODE; goto unlock_out; } @@ -1695,12 +1741,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - gfpflags &= gfp_allowed_mask; - - lockdep_trace_alloc(gfpflags); - might_sleep_if(gfpflags & __GFP_WAIT); - - if (should_failslab(s->objsize, gfpflags, s->flags)) + if (slab_pre_alloc_hook(s, gfpflags)) return NULL; local_irq_save(flags); @@ -1719,8 +1760,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); + slab_post_alloc_hook(s, gfpflags, object); return object; } @@ -1736,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace); + +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +{ + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -1754,16 +1804,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); +#endif #endif /* @@ -1850,14 +1904,14 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - kmemleak_free_recursive(x, s->flags); + slab_free_hook(s, x); + local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); - if (likely(page == c->page && c->node >= 0)) { + + slab_free_hook_irq(s, x); + + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); c->freelist = object; stat(s, FREE_FASTPATH); @@ -1879,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab page the object resides */ -static struct page *get_object_page(const void *x) -{ - struct page *page = virt_to_head_page(x); - - if (!PageSlab(page)) - return NULL; - - return page; -} - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -2062,26 +2105,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { - if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) - /* - * Boot time creation of the kmalloc array. Use static per cpu data - * since the per cpu allocator is not available yet. - */ - s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); - else - s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); - if (!s->cpu_slab) - return 0; + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); - return 1; + return s->cpu_slab != NULL; } -#ifdef CONFIG_NUMA +static struct kmem_cache *kmem_cache_node; + /* * No kmalloc_node yet so do it by hand. We know that this is the first * slab on the node for this slabcache. There are no concurrent accesses @@ -2091,15 +2126,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) +static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; unsigned long flags; - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!page); if (page_to_nid(page) != node) { @@ -2111,15 +2146,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); + page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; - kmalloc_caches->node[node] = n; + kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG - init_object(kmalloc_caches, n, 1); - init_tracking(kmalloc_caches, n); + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmalloc_caches); - inc_slabs_node(kmalloc_caches, node, page->objects); + init_kmem_cache_node(n, kmem_cache_node); + inc_slabs_node(kmem_cache_node, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2137,13 +2172,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; + if (n) - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; } } -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; @@ -2151,11 +2188,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) struct kmem_cache_node *n; if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); + early_kmem_cache_node_alloc(node); continue; } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); if (!n) { free_kmem_cache_nodes(s); @@ -2167,17 +2204,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } return 1; } -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} - -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) -{ - init_kmem_cache_node(&s->local_node, s); - return 1; -} -#endif static void set_min_partial(struct kmem_cache *s, unsigned long min) { @@ -2312,7 +2338,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, +static int kmem_cache_open(struct kmem_cache *s, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -2348,10 +2374,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif - if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (!init_kmem_cache_nodes(s)) goto error; - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s)) return 1; free_kmem_cache_nodes(s); @@ -2365,35 +2391,6 @@ error: } /* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) -{ - struct page *page; - - if (!kern_ptr_validate(object, s->size)) - return 0; - - page = get_object_page(object); - - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; - - if (!check_valid_pointer(s, page, object)) - return 0; - - /* - * We could also check if the object is on the slabs freelist. - * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid() is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); - -/* * Determine the size of a slab object */ unsigned int kmem_cache_size(struct kmem_cache *s) @@ -2414,9 +2411,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), - GFP_ATOMIC); - + unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * + sizeof(long), GFP_ATOMIC); if (!map) return; slab_err(s, page, "%s", text); @@ -2448,9 +2444,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - list_del(&page->lru); + __remove_partial(n, page); discard_slab(s, page); - n->nr_partial--; } else { list_slab_objects(s, page, "Objects remaining on kmem_cache_close()"); @@ -2507,9 +2502,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; +struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); +static struct kmem_cache *kmem_cache; + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; +#endif + static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2546,116 +2547,29 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) +static struct kmem_cache *__init create_kmalloc_cache(const char *name, + int size, unsigned int flags) { - unsigned int flags = 0; + struct kmem_cache *s; - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; + s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slub_lock here. */ - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - - if (sysfs_slab_add(s)) - goto panic; return s; panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); + return NULL; } -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; - -static void sysfs_add_func(struct work_struct *w) -{ - struct kmem_cache *s; - - down_write(&slub_lock); - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & __SYSFS_ADD_DEFERRED) { - s->flags &= ~__SYSFS_ADD_DEFERRED; - sysfs_slab_add(s); - } - } - up_write(&slub_lock); -} - -static DECLARE_WORK(sysfs_add_work, sysfs_add_func); - -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) -{ - struct kmem_cache *s; - char *text; - size_t realsize; - unsigned long slabflags; - int i; - - s = kmalloc_caches_dma[index]; - if (s) - return s; - - /* Dynamically create dma cache */ - if (flags & __GFP_WAIT) - down_write(&slub_lock); - else { - if (!down_write_trylock(&slub_lock)) - goto out; - } - - if (kmalloc_caches_dma[index]) - goto unlock_out; - - realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - - s = NULL; - for (i = 0; i < KMALLOC_CACHES; i++) - if (!kmalloc_caches[i].size) - break; - - BUG_ON(i >= KMALLOC_CACHES); - s = kmalloc_caches + i; - - /* - * Must defer sysfs creation to a workqueue because we don't know - * what context we are called from. Before sysfs comes up, we don't - * need to do anything because our sysfs initcall will start by - * adding all existing slabs to sysfs. - */ - slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; - if (slab_state >= SYSFS) - slabflags |= __SYSFS_ADD_DEFERRED; - - if (!text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - s->size = 0; - kfree(text); - goto unlock_out; - } - - list_add(&s->list, &slab_caches); - kmalloc_caches_dma[index] = s; - - if (slab_state >= SYSFS) - schedule_work(&sysfs_add_work); - -unlock_out: - up_write(&slub_lock); -out: - return kmalloc_caches_dma[index]; -} -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -2708,10 +2622,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) - return dma_kmalloc_cache(index, flags); + return kmalloc_dma_caches[index]; #endif - return &kmalloc_caches[index]; + return kmalloc_caches[index]; } void *__kmalloc(size_t size, gfp_t flags) @@ -2735,6 +2649,7 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; @@ -2749,7 +2664,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; @@ -2889,8 +2803,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. */ - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { @@ -2914,7 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -2956,7 +2869,7 @@ static void slab_mem_offline_callback(void *arg) BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); } } up_read(&slub_lock); @@ -2989,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg) * since memory is not yet available from the node that * is brought up. */ - n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -3035,46 +2948,92 @@ static int slab_memory_callback(struct notifier_block *self, * Basic setup of slabs *******************************************************************/ +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator + */ + +static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +{ + int node; + + list_add(&s->list, &slab_caches); + s->refcount = -1; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + struct page *p; + + if (n) { + list_for_each_entry(p, &n->partial, lru) + p->slab = s; + +#ifdef CONFIG_SLAB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab = s; +#endif + } + } +} + void __init kmem_cache_init(void) { int i; int caches = 0; + struct kmem_cache *temp_kmem_cache; + int order; + struct kmem_cache *temp_kmem_cache_node; + unsigned long kmalloc_size; + + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); + + /* Allocate two kmem_caches from the page allocator */ + kmalloc_size = ALIGN(kmem_size, cache_line_size()); + order = get_order(2 * kmalloc_size); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); -#ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the * struct kmem_cache_node's. There is special bootstrap code in * kmem_cache_open for slab_state == DOWN. */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_NOWAIT); - kmalloc_caches[0].refcount = -1; - caches++; + kmem_cache_node = (void *)kmem_cache + kmalloc_size; + + kmem_cache_open(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif /* Able to allocate the per node structures */ slab_state = PARTIAL; - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_NOWAIT); - caches++; - } - if (KMALLOC_MIN_SIZE <= 64) { - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_NOWAIT); - caches++; - } + temp_kmem_cache = kmem_cache; + kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache, temp_kmem_cache, kmem_size); - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_NOWAIT); - caches++; - } + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + + caches++; + kmem_cache_bootstrap_fixup(kmem_cache); + caches++; + /* Free temporary boot structure */ + free_pages((unsigned long)temp_kmem_cache, order); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ /* * Patch up the size_index table if we have strange large alignment @@ -3114,26 +3073,60 @@ void __init kmem_cache_init(void) size_index[size_index_elem(i)] = 8; } + /* Caches that are not of the two-to-the-power-of size */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); + caches++; + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); + caches++; + } + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[1]->name); + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[2]->name); + } + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); BUG_ON(!s); - kmalloc_caches[i].name = s; + kmalloc_caches[i]->name = s; } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_NUMA - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); -#else - kmem_size = sizeof(struct kmem_cache); -#endif +#ifdef CONFIG_ZONE_DMA + for (i = 0; i < SLUB_PAGE_SHIFT; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s && s->size) { + char *name = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", s->objsize); + + BUG_ON(!name); + kmalloc_dma_caches[i] = create_kmalloc_cache(name, + s->objsize, SLAB_CACHE_DMA); + } + } +#endif printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -3211,6 +3204,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + char *n; if (WARN_ON(!name)) return NULL; @@ -3234,24 +3228,30 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, return s; } + n = kstrdup(name, GFP_KERNEL); + if (!n) + goto err; + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { - if (kmem_cache_open(s, GFP_KERNEL, name, + if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); if (sysfs_slab_add(s)) { list_del(&s->list); + kfree(n); kfree(s); goto err; } up_write(&slub_lock); return s; } + kfree(n); kfree(s); } +err: up_write(&slub_lock); -err: if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -3318,6 +3318,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { @@ -3346,8 +3347,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +#endif -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int count_inuse(struct page *page) { return page->inuse; @@ -3357,7 +3359,9 @@ static int count_total(struct page *page) { return page->objects; } +#endif +#ifdef CONFIG_SLUB_DEBUG static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3373,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } @@ -3448,65 +3452,6 @@ static long validate_slab_cache(struct kmem_cache *s) kfree(map); return count; } - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches + 4); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches + 5); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches + 6); - - printk(KERN_ERR "\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 7); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches + 8); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 9); -} -#else -static void resiliency_test(void) {}; -#endif - /* * Generate lists of code addresses where slabcache objects are allocated * and freed. @@ -3635,7 +3580,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc, - long *map) + unsigned long *map) { void *addr = page_address(page); void *p; @@ -3691,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, "<not-available>"); @@ -3735,7 +3680,71 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } +#endif + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches[4]); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif + +#ifdef CONFIG_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -3788,6 +3797,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } + lock_memory_hotplug(); +#ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3804,7 +3815,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } - } else if (flags & SO_PARTIAL) { + } else +#endif + if (flags & SO_PARTIAL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3825,10 +3838,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + unlock_memory_hotplug(); kfree(nodes); return x + sprintf(buf + x, "\n"); } +#ifdef CONFIG_SLUB_DEBUG static int any_slab_objects(struct kmem_cache *s) { int node; @@ -3844,6 +3859,7 @@ static int any_slab_objects(struct kmem_cache *s) } return 0; } +#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj); @@ -3930,12 +3946,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); @@ -3945,12 +3958,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(aliases); -static ssize_t slabs_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL); -} -SLAB_ATTR_RO(slabs); - static ssize_t partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL); @@ -3975,93 +3982,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); -static ssize_t total_objects_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); -} -SLAB_ATTR_RO(total_objects); - -static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } -static ssize_t sanity_checks_store(struct kmem_cache *s, +static ssize_t reclaim_account_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_RECLAIM_ACCOUNT; if (buf[0] == '1') - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_RECLAIM_ACCOUNT; return length; } -SLAB_ATTR(sanity_checks); +SLAB_ATTR(reclaim_account); -static ssize_t trace_show(struct kmem_cache *s, char *buf) +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } +SLAB_ATTR_RO(hwcache_align); -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') - s->flags |= SLAB_TRACE; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } -SLAB_ATTR(trace); +SLAB_ATTR_RO(cache_dma); +#endif -#ifdef CONFIG_FAILSLAB -static ssize_t failslab_show(struct kmem_cache *s, char *buf) +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); } +SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t failslab_store(struct kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_FAILSLAB; - if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; - return length; + return show_slab_objects(s, buf, SO_ALL); } -SLAB_ATTR(failslab); -#endif +SLAB_ATTR_RO(slabs); -static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); } +SLAB_ATTR_RO(total_objects); -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); } -SLAB_ATTR(reclaim_account); -static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; } -SLAB_ATTR_RO(hwcache_align); +SLAB_ATTR(sanity_checks); -#ifdef CONFIG_ZONE_DMA -static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } -SLAB_ATTR_RO(cache_dma); -#endif -static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; } -SLAB_ATTR_RO(destroy_by_rcu); +SLAB_ATTR(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { @@ -4139,6 +4136,40 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t shrink_show(struct kmem_cache *s, char *buf) { return 0; @@ -4158,22 +4189,6 @@ static ssize_t shrink_store(struct kmem_cache *s, } SLAB_ATTR(shrink); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); - #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4279,25 +4294,27 @@ static struct attribute *slab_attrs[] = { &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, - &total_objects_attr.attr, - &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, &aliases_attr.attr, &align_attr.attr, - &sanity_checks_attr.attr, - &trace_attr.attr, &hwcache_align_attr.attr, &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, + &shrink_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, +#endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4377,6 +4394,7 @@ static void kmem_cache_release(struct kobject *kobj) { struct kmem_cache *s = to_slab(kobj); + kfree(s->name); kfree(s); } @@ -4579,7 +4597,7 @@ static int __init slab_sysfs_init(void) } __initcall(slab_sysfs_init); -#endif +#endif /* CONFIG_SYSFS */ /* * The /proc/slabinfo ABI diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index aa33fd67fa41..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -9,7 +9,7 @@ * * However, virtual mappings need a page table and TLBs. Many Linux * architectures already map their physical space using 1-1 mappings - * via TLBs. For those arches the virtual memmory map is essentially + * via TLBs. For those arches the virtual memory map is essentially * for free if we use the same page size as the 1-1 mappings. In that * case the overhead consists of a few additional pages that are * allocated to create a view of memory for vmemmap. @@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ -#ifdef CONFIG_NO_BOOTMEM - free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); - if (vmemmap_buf_start < vmemmap_buf) { - char name[15]; - - snprintf(name, sizeof(name), "MEMMAP %d", nodeid); - reserve_early_without_check(__pa(vmemmap_buf_start), - __pa(vmemmap_buf), name); - } -#else free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); -#endif vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); diff --git a/mm/swap.c b/mm/swap.c index 3ce7bc373a52..c02f93611a84 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; - dtor = get_compound_page_dtor(page); - (*dtor)(page); + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} + +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +155,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); @@ -378,6 +458,7 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_free(&pages_to_free); } +EXPORT_SYMBOL(release_pages); /* * The pages which we're about to release may be in the deferred lru-addition @@ -398,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167f..5c8cfabbc9bc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index 7c703ff2f36f..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> +#include <linux/poll.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -139,7 +144,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) return err; cond_resched(); @@ -150,7 +155,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) break; @@ -189,7 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) + nr_blocks, GFP_NOIO, 0)) break; } @@ -959,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); @@ -1672,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1680,6 +1687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1688,6 +1697,25 @@ out: } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { @@ -1771,7 +1799,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1779,6 +1824,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -1894,8 +1940,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { bdev = NULL; error = -EINVAL; @@ -2084,12 +2131,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: if (bdev) { set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(type); diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c5..49feb46e77b8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + page_cache_release(page); /* pagecache ref */ return 1; failed: @@ -545,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache); * @inode: inode * @newsize: new file size * - * truncate_setsize updastes i_size update and performs pagecache - * truncation (if necessary) for a file size updates. It will be - * typically be called from the filesystem's setattr function when - * ATTR_SIZE is passed in. + * truncate_setsize updates i_size and performs pagecache truncation (if + * necessary) to @newsize. It will be typically be called from the filesystem's + * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and after all filesystem - * specific block truncation has been performed. + * Must be called with inode_mutex held and before all filesystem specific + * block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { diff --git a/mm/util.c b/mm/util.c index 4735ea481816..f126975ef23e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -186,27 +186,6 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); -int kern_ptr_validate(const void *ptr, unsigned long size) -{ - unsigned long addr = (unsigned long)ptr; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = sizeof(void *) - 1; - - if (unlikely(addr < min_addr)) - goto out; - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) - goto out; - if (unlikely(!kern_addr_valid(addr + size - 1))) - goto out; - return 1; -out: - return 0; -} - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -245,6 +224,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + * If the architecture not support this fucntion, simply return with no + * page pinned + */ +int __attribute__((weak)) __get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + return 0; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8889da69a6..f9b166732e70 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,8 +31,6 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> -bool vmap_lazy_unmap __read_mostly = true; - /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -293,13 +291,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); @@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) { unsigned int log; - if (!vmap_lazy_unmap) - return 0; - log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -517,6 +512,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void purge_fragmented_blocks_allcpus(void); /* + * called before a call to iounmap() if the caller wants vm_area_struct's + * immediately freed. + */ +void set_iounmap_nonlazy(void) +{ + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); +} + +/* * Purges all lazily-freed vmap areas. * * If sync is 0 then don't purge if there is already a purge in progress. @@ -557,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - unmap_vmap_area(va); list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; @@ -602,10 +605,11 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) +static void free_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -614,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) } /* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + +/* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) @@ -734,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } @@ -789,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area_noflush(vb->va); + free_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -927,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + spin_lock(&vb->lock); BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); @@ -979,7 +995,6 @@ void vm_unmap_aliases(void) s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); - vunmap_page_range(s, e); flush = 1; if (s < start) @@ -1160,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB @@ -1300,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1522,25 +1531,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1550,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1562,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1580,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, @@ -1587,6 +1604,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1598,12 +1622,28 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); /** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * @@ -1644,6 +1684,25 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -2056,6 +2115,7 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +#ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; @@ -2145,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans areas from the end looking for @@ -2166,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2176,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2210,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } @@ -2336,9 +2393,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) free_vm_area(vms[i]); kfree(vms); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -2365,6 +2424,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } @@ -2395,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index c5dfabf25f11..17497d0cd8b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> @@ -51,6 +52,24 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> +/* + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone + */ +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -79,10 +98,10 @@ struct scan_control { int order; /* - * Intend to reclaim enough contenious memory rather than to reclaim - * enough amount memory. I.e, it's the mode for high order allocation. + * Intend to reclaim enough continuous memory rather than reclaim + * enough amount of memory. i.e, mode for high order allocation. */ - bool lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -265,6 +284,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; + + /* + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. + */ + if (COMPACTION_BUILD) + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; + else + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; + + /* + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->reclaim_mode |= syncmode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->reclaim_mode |= syncmode; + else + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; +} + +static void reset_reclaim_mode(struct scan_control *sc) +{ + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -275,7 +327,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -283,6 +336,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -307,12 +364,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -330,7 +381,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -366,7 +417,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -376,7 +427,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; @@ -394,7 +444,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -402,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sync_writeback)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -459,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) spin_unlock_irq(&mapping->tree_lock); swapcache_free(swap, page); } else { + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } return 1; @@ -580,7 +638,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -616,7 +674,7 @@ static enum page_references page_check_references(struct page *page, } /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page) + if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; @@ -644,12 +702,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct zone *zone, + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); @@ -669,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -694,10 +755,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -743,6 +807,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -751,14 +817,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -841,6 +911,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + reset_reclaim_mode(sc); continue; activate_locked: @@ -853,10 +924,21 @@ activate_locked: keep_locked: unlock_page(page); keep: + reset_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested && nr_dirty != 0) + zone_set_flag(zone, ZONE_CONGESTED); + free_page_list(&free_pages); list_splice(&ret_pages, page_list); @@ -962,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1006,7 +1088,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; /* * If we don't have enough swap space, reclaiming of @@ -1014,23 +1096,28 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * pointless. */ if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - continue; + !PageSwapCache(cursor_page)) + break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; scan++; } else { - if (mode == ISOLATE_BOTH && - page_count(cursor_page)) - nr_lumpy_failed++; + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; @@ -1070,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1187,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1253,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (!sc->lumpy_reclaim_mode) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1286,7 +1375,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_active; unsigned long nr_anon; unsigned long nr_file; @@ -1298,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1318,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1337,20 +1425,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list, NULL); - count_vm_events(PGDEACTIVATE, nr_active); - - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + set_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1359,6 +1439,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1398,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); @@ -1466,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1506,6 +1592,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1531,12 +1618,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1721,19 +1822,55 @@ out: } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) { + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + return false; + /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } } /* @@ -1745,13 +1882,14 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; +restart: + nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { @@ -1775,16 +1913,20 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -1937,21 +2079,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; + + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + &cpuset_current_mems_allowed, + &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); + } } out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; - delayacct_freepages_end(); put_mems_allowed(); @@ -2063,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * precentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely? */ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), - 0, 0)) - return 1; + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), + classzone_idx, 0)) + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, classzone_idx); + else + return !all_zones_ok; } /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2111,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int *classzone_idx) { int all_zones_ok; + unsigned long balanced; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2138,7 +2328,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2147,6 +2336,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2169,9 +2359,10 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2194,6 +2385,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2215,7 +2407,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2223,9 +2415,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2236,7 +2445,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2244,13 +2453,24 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } - if (all_zones_ok) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2273,7 +2493,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced + */ + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2298,7 +2524,88 @@ out: goto loop_again; } - return sc.nr_reclaimed; + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + *classzone_idx = end_zone; + return order; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); } /* @@ -2317,9 +2624,10 @@ out: static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2347,49 +2655,30 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - schedule(); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) @@ -2401,7 +2690,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; @@ -2410,23 +2699,26 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) - return; - if (pgdat->kswapd_max_order < order) - pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; + if (pgdat->kswapd_max_order < order) { + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -2987,6 +3279,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. @@ -3033,4 +3326,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e669aaa..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,8 @@ #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> +#include <linux/writeback.h> +#include <linux/compaction.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -81,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -140,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -159,42 +185,50 @@ static void refresh_zone_stat_thresholds(void) } } +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +} + /* * For use when we know that interrupts are disabled. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; long x; + long t; + + x = delta + __this_cpu_read(*p); - x = delta + *p; + t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { + if (unlikely(x > t || x < -t)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; + __this_cpu_write(*p, x); } EXPORT_SYMBOL(__mod_zone_page_state); /* - * For an unknown interrupt state - */ -void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_zone_page_state(zone, item, delta); - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_zone_page_state); - -/* * Optimized increment and decrement functions. * * These are only for a single page and therefore can take a struct page * @@ -219,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - (*p)++; + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; - if (unlikely(*p > pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; - - zone_page_state_add(*p + overstep, zone, item); - *p = -overstep; + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); } } @@ -240,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - s8 *p = pcp->vm_stat_diff + item; - - (*p)--; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - if (unlikely(*p < - pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; - zone_page_state_add(*p - overstep, zone, item); - *p = overstep; + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); } } @@ -259,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); +#ifdef CONFIG_CMPXCHG_LOCAL +/* + * If we have cmpxchg_local support then we do not need to incur the overhead + * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. + * + * mod_state() modifies the zone counter state through atomic per cpu + * operations. + * + * Overstep mode specifies how overstep should handled: + * 0 No overstepping + * 1 Overstepping half of threshold + * -1 Overstepping minus half of threshold +*/ +static inline void mod_state(struct zone *zone, + enum zone_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to zone counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong the cpu if we get + * rescheduled while executing here. However, the following + * will apply the threshold again and therefore bring the + * counter under the threshold. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (n > t || n < -t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to zone counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + zone_page_state_add(z, zone, item); +} + +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + mod_state(zone, item, delta, 0); +} +EXPORT_SYMBOL(mod_zone_page_state); + +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + mod_state(zone, item, 1, 1); +} + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_zone_page_state); +#else +/* + * Use interrupt disable to serialize counter updates + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + void inc_zone_state(struct zone *zone, enum zone_stat_item item) { unsigned long flags; @@ -289,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); +#endif /* * Update the zone counters for one cpu. @@ -394,6 +517,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_COMPACTION + struct contig_page_info { unsigned long free_pages; unsigned long free_blocks_total; @@ -745,6 +869,9 @@ static const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "nr_shmem", + "nr_dirtied", + "nr_written", + #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -753,6 +880,9 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", @@ -826,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -904,36 +1034,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return (unsigned long *)m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) @@ -1017,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); per_cpu(vmstat_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: |