diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 993 |
1 files changed, 479 insertions, 514 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc435ee..2b3bf6767d54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include <linux/sched/rt.h> #include <linux/page_owner.h> #include <linux/kthread.h> +#include <linux/memcontrol.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -90,6 +91,11 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY +volatile u64 latent_entropy __latent_entropy; +EXPORT_SYMBOL(latent_entropy); +#endif + /* * Array of node states. */ @@ -286,15 +292,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) - return true; - - return false; -} + int nid = early_pfn_to_nid(pfn); -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; @@ -339,11 +339,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - return false; -} - static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -617,6 +612,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -625,6 +623,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -645,23 +646,31 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return false; + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -673,6 +682,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); set_page_private(page, 0); @@ -680,9 +692,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +struct page_ext_operations debug_guardpage_ops; +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -998,6 +1010,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + if (compound) + ClearPageDoubleMap(page); for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -1008,8 +1022,10 @@ static __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageAnonHead(page)) + if (PageMappingFlags(page)) page->mapping = NULL; + if (memcg_kmem_enabled() && PageKmemcg(page)) + memcg_kmem_uncharge(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -1076,9 +1092,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1133,9 +1149,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -1267,7 +1283,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; @@ -1393,15 +1409,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++) + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1469,9 +1488,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; goto free_range; @@ -1484,7 +1503,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += nr_to_free; @@ -1520,6 +1539,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } @@ -1616,18 +1638,15 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -1716,6 +1735,19 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +inline void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags) +{ + set_page_private(page, 0); + set_page_refcounted(page); + + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); + set_page_owner(page, order, gfp_flags); +} + static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { @@ -1728,13 +1760,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags poisoned &= page_is_poisoned(p); } - set_page_private(page, 0); - set_page_refcounted(page); - - arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); - kasan_alloc_pages(page, order); + post_alloc_hook(page, order, gfp_flags); if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) @@ -1743,8 +1769,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - set_page_owner(page, order, gfp_flags); - /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking @@ -2453,7 +2477,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; - gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -2467,12 +2490,9 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - gfp_mask = get_page_owner_gfp(page); - set_page_owner(page, 0, gfp_mask); - for (i = 1; i < (1 << order); i++) { + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - set_page_owner(page + i, 0, gfp_mask); - } + split_page_owner(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2488,9 +2508,14 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + /* + * Obey watermarks as if the page was being allocated. We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -2501,9 +2526,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - set_page_owner(page, order, __GFP_MOVABLE); - - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2519,33 +2545,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. - */ -int split_free_page(struct page *page) -{ - unsigned int order; - int nr_pages; - - order = page_order(page); - - nr_pages = __isolate_free_page(page, order); - if (!nr_pages) - return 0; - - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); - return nr_pages; -} - -/* * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. @@ -2609,11 +2608,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, page = list_last_entry(list, struct page, lru); else page = list_first_entry(list, struct page, lru); - } while (page && check_new_pcp(page)); - __dec_zone_state(zone, NR_ALLOC_BATCH); - list_del(&page->lru); - pcp->count--; + list_del(&page->lru); + pcp->count--; + + } while (check_new_pcp(page)); } else { /* * We most definitely don't want callers attempting to @@ -2635,16 +2634,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) - set_bit(ZONE_FAIR_DEPLETED, &zone->flags); - - __count_zone_vm_events(PGALLOC, zone, 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -2854,40 +2848,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return local_zone->node == zone->node; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return true; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ -static void reset_alloc_batches(struct zone *preferred_zone) -{ - struct zone *zone = preferred_zone->zone_pgdat->node_zones; - - do { - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); - } while (zone++ != preferred_zone); -} - /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -2898,10 +2870,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - bool fair_skipped = false; - bool apply_fair = (alloc_flags & ALLOC_FAIR); + struct pglist_data *last_pgdat_dirty_limit = NULL; -zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2916,50 +2886,33 @@ zonelist_scan: !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* - * Distribute pages in proportion to the individual - * zone size to ensure fair page aging. The zone a - * page was allocated in should have no effect on the - * time the page has in memory before being reclaimed. - */ - if (apply_fair) { - if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - fair_skipped = true; - continue; - } - if (!zone_local(ac->preferred_zoneref->zone, zone)) { - if (fair_skipped) - goto reset_fair; - apply_fair = false; - } - } - /* * When allocating a page cache page for writing, we - * want to get it from a zone that is within its dirty - * limit, such that no single zone holds more than its + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its * proportional share of globally allowed dirty pages. - * The dirty limits take into account the zone's + * The dirty limits take into account the node's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * - * This may look like it could increase pressure on - * lower zones by failing allocations in higher zones - * before they are full. But the pages that do spill - * over are limited as the lower zones are protected - * by this very same mechanism. It should not become - * a practical burden to them. - * * XXX: For now, allow allocations to potentially - * exceed the per-zone dirty limit in the slowpath + * exceed the per-node dirty limit in the slowpath * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed - * zones are together not big enough to reach the + * nodes are together not big enough to reach the * global limit. The proper fix for these situations - * will require awareness of zones in the + * will require awareness of nodes in the * dirty-throttling and the flusher threads. */ - if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) - continue; + if (ac->spread_dirty_pages) { + if (last_pgdat_dirty_limit == zone->zone_pgdat) + continue; + + if (!node_dirty_ok(zone->zone_pgdat)) { + last_pgdat_dirty_limit = zone->zone_pgdat; + continue; + } + } mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, @@ -2971,16 +2924,16 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || + if (node_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: @@ -3010,22 +2963,6 @@ try_this_zone: } } - /* - * The first pass makes sure allocations are spread fairly within the - * local node. However, the local node might have free pages left - * after the fairness batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without fairness, and - * include remote zones now, before entering the slowpath and waking - * kswapd: prefer spilling to a remote zone over swapping locally. - */ - if (fair_skipped) { -reset_fair: - apply_fair = false; - fair_skipped = false; - reset_alloc_batches(ac->preferred_zoneref->zone); - goto zonelist_scan; - } - return NULL; } @@ -3047,9 +2984,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3067,22 +3006,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; + pr_warn("%s: ", current->comm); - pr_warn("%pV", &vaf); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - va_end(args); - } + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -3095,6 +3028,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct oom_control oc = { .zonelist = ac->zonelist, .nodemask = ac->nodemask, + .memcg = NULL, .gfp_mask = gfp_mask, .order = order, }; @@ -3169,7 +3103,6 @@ out: return page; } - /* * Maximum number of compaction retries wit a progress before OOM * killer is consider as the only way to move forward. @@ -3181,17 +3114,16 @@ out: static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, &contended_compaction); + prio); current->flags &= ~PF_MEMALLOC; if (*compact_result <= COMPACT_INACTIVE) @@ -3203,8 +3135,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3221,24 +3152,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); - /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. - */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) - *compact_result = COMPACT_CONTENDED; - - /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. - */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) - *compact_result = COMPACT_CONTENDED; - cond_resched(); return NULL; @@ -3246,26 +3159,26 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, enum migrate_mode *migrate_mode, - int compaction_retries) + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; + int min_priority; if (!order) return false; + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the - * failure could be caused by weak migration mode. + * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) { - if (*migrate_mode == MIGRATE_ASYNC) { - *migrate_mode = MIGRATE_SYNC_LIGHT; - return true; - } - return false; - } + if (compaction_failed(compact_result)) + goto check_priority; /* * make sure the compaction wasn't deferred or didn't bail out early @@ -3286,16 +3199,28 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (compaction_retries <= max_retries) + if (*compaction_retries <= max_retries) return true; + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. + */ +check_priority: + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } return false; } #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { *compact_result = COMPACT_SKIPPED; return NULL; @@ -3304,8 +3229,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, - enum migrate_mode *migrate_mode, - int compaction_retries) + enum compact_priority *compact_priority, + int *compaction_retries) { struct zone *zone; struct zoneref *z; @@ -3372,8 +3297,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; retry: - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -3394,10 +3318,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; + pg_data_t *last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, ac_classzone_idx(ac)); + ac->high_zoneidx, ac->nodemask) { + if (last_pgdat != zone->zone_pgdat) + wakeup_kswapd(zone, order, ac->high_zoneidx); + last_pgdat = zone->zone_pgdat; + } } static inline unsigned int @@ -3431,16 +3359,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (gfp_mask & __GFP_MEMALLOC) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - alloc_flags |= ALLOC_NO_WATERMARKS; - } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3450,12 +3368,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); -} + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return false; -static inline bool is_thp_gfp_mask(gfp_t gfp_mask) -{ - return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + if (gfp_mask & __GFP_MEMALLOC) + return true; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return true; + if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + return true; + + return false; } /* @@ -3478,23 +3403,33 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, int no_progress_loops) + bool did_some_progress, int *no_progress_loops) { struct zone *zone; struct zoneref *z; /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + + /* * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (no_progress_loops > MAX_RECLAIM_RETRIES) + if (*no_progress_loops > MAX_RECLAIM_RETRIES) return false; /* - * Keep reclaiming pages while there is a chance this will lead somewhere. - * If none of the target zones can satisfy our allocation request even - * if all reclaimable pages are considered then we are screwed and have - * to go OOM. + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { @@ -3502,7 +3437,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long reclaimable; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP(no_progress_loops * available, + available -= DIV_ROUND_UP((*no_progress_loops) * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3519,14 +3454,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * prevent from pre mature OOM */ if (!did_some_progress) { - unsigned long writeback; - unsigned long dirty; + unsigned long write_pending; - writeback = zone_page_state_snapshot(zone, - NR_WRITEBACK); - dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + write_pending = zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); - if (2*(writeback + dirty) > reclaimable) { + if (2 * write_pending > reclaimable) { congestion_wait(BLK_RW_ASYNC, HZ/10); return true; } @@ -3561,10 +3494,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum migrate_mode migration_mode = MIGRATE_ASYNC; + enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3585,37 +3520,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry: + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* - * OK, we're below the kswapd watermark and have kicked background - * reclaim. Now things get more complex, so set up alloc_flags according - * to how we want to proceed. + * The adjusted alloc_flags might result in immediate success, so try + * that first */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - - /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; - /* Allocate without watermarks if the context allows */ - if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. Don't try + * that for allocations that are allowed to ignore watermarks, as the + * ALLOC_NO_WATERMARKS attempt didn't yet happen. + */ + if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && + !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + /* - * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds - * the allocation is high priority and these type of - * allocations are system rather than user orientated + * Checks for costly allocations with __GFP_NORETRY, which + * includes THP page fault allocations */ + if (gfp_mask & __GFP_NORETRY) { + /* + * If compaction is deferred for high-order allocations, + * it is because sync compaction recently failed. If + * this is the case and the caller requested a THP + * allocation, we do not want to heavily disrupt the + * system, so we fail the allocation instead of entering + * direct reclaim. + */ + if (compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + + if (gfp_pfmemalloc_allowed(gfp_mask)) + alloc_flags = ALLOC_NO_WATERMARKS; + + /* + * Reset the zonelist iterators if memory policies can be ignored. + * These allocations are high priority and system rather than user + * orientated. + */ + if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); } + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* @@ -3645,38 +3631,6 @@ retry: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* - * Try direct compaction. The first pass is asynchronous. Subsequent - * attempts after direct reclaim are synchronous - */ - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, - migration_mode, - &compact_result); - if (page) - goto got_pg; - - /* Checks for THP-specific high-order allocations */ - if (is_thp_gfp_mask(gfp_mask)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; - - /* - * Compaction is contended so rather back off than cause - * excessive stalls. - */ - if(compact_result == COMPACT_CONTENDED) - goto nopage; - } - - if (order && compaction_made_progress(compact_result)) - compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3684,29 +3638,33 @@ retry: if (page) goto got_pg; + /* Try direct compaction and then allocating */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + compact_priority, &compact_result); + if (page) + goto got_pg; + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) - goto noretry; + goto nopage; /* * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) - goto noretry; + goto nopage; - /* - * Costly allocations might have made a progress but this doesn't mean - * their order will become available due to high fragmentation so - * always increment the no progress counter for them - */ - if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) - no_progress_loops = 0; - else - no_progress_loops++; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, + "page alloction stalls for %ums, order:%u\n", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; + } if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, no_progress_loops)) + did_some_progress > 0, &no_progress_loops)) goto retry; /* @@ -3717,8 +3675,8 @@ retry: */ if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, - compact_result, &migration_mode, - compaction_retries)) + compact_result, &compact_priority, + &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ @@ -3732,27 +3690,9 @@ retry: goto retry; } -noretry: - /* - * High-order allocations do not necessarily loop after direct reclaim - * and reclaim/compaction depends on compaction being called after - * reclaim so call directly if necessary. - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. All other requests should tolerate - * at least light sync migration. - */ - if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_ASYNC; - else - migration_mode = MIGRATE_SYNC_LIGHT; - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, - ac, migration_mode, - &compact_result); - if (page) - goto got_pg; nopage: - warn_alloc_failed(gfp_mask, order, NULL); + warn_alloc(gfp_mask, + "page allocation failure: order:%u", order); got_pg: return page; } @@ -3766,7 +3706,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { struct page *page; unsigned int cpuset_mems_cookie; - unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), @@ -3808,7 +3748,11 @@ retry_cpuset: /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); - /* The preferred zone is used for statistics later */ + /* + * The preferred zone is used for statistics but crucially it is + * also used as the starting point for the zonelist iterator. It + * may get reset for allocations that ignore memory policies. + */ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask); if (!ac.preferred_zoneref) { @@ -3849,6 +3793,12 @@ no_zone: } out: + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && + unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + __free_pages(page, order); + page = NULL; + } + if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); @@ -4004,56 +3954,6 @@ void __free_page_frag(void *addr) } EXPORT_SYMBOL(__free_page_frag); -/* - * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is - * equivalent to alloc_pages. - * - * It should be used when the caller would like to use kmalloc, but since the - * allocation is large, it has to fall back to the page allocator. - */ -struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages(gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -/* - * __free_kmem_pages and free_kmem_pages will free pages allocated with - * alloc_kmem_pages. - */ -void __free_kmem_pages(struct page *page, unsigned int order) -{ - memcg_kmem_uncharge(page, order); - __free_pages(page, order); -} - -void free_kmem_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_kmem_pages(virt_to_page((void *)addr), order); - } -} - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { @@ -4199,7 +4099,7 @@ long si_mem_available(void) int lru; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += zone->watermark[WMARK_LOW]; @@ -4235,7 +4135,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = global_page_state(NR_SHMEM); + val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -4257,8 +4157,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; - val->sharedram = node_page_state(nid, NR_SHMEM); - val->freeram = node_page_state(nid, NR_FREE_PAGES); + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; @@ -4341,6 +4241,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4356,26 +4257,73 @@ void show_free_areas(unsigned int filter) " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), - global_page_state(NR_FILE_MAPPED), - global_page_state(NR_SHMEM), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " unstable:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif + K(node_page_state(pgdat, NR_SHMEM)), + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_PAGES_SCANNED), + !pgdat_reclaimable(pgdat) ? "yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -4397,61 +4345,41 @@ void show_free_areas(unsigned int filter) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" + " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " mapped:%lukB" - " shmem:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" - " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" - " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_FILE_DIRTY)), - K(zone_page_state(zone, NR_WRITEBACK)), - K(zone_page_state(zone, NR_FILE_MAPPED)), - K(zone_page_state(zone, NR_SHMEM)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), - K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES)), - K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? "yes" : "no") - ); + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -4493,7 +4421,7 @@ void show_free_areas(unsigned int filter) hugetlb_show_meminfo(); - printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } @@ -4518,7 +4446,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, do { zone_type--; zone = pgdat->node_zones + zone_type; - if (populated_zone(zone)) { + if (managed_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); @@ -4713,7 +4641,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4729,7 +4657,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4750,13 +4678,13 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { + if (managed_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); @@ -4868,6 +4796,8 @@ int local_memory_node(int node) } #endif +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void set_zonelist_order(void) @@ -4883,7 +4813,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* @@ -5156,15 +5086,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. If zone==ZONE_MOVABLE but memory is * mirrored, it's an overlapped memmap init. skip it. @@ -5372,13 +5293,18 @@ static void __meminit setup_zone_pageset(struct zone *zone) */ void __init setup_per_cpu_pageset(void) { + struct pglist_data *pgdat; struct zone *zone; for_each_populated_zone(zone) setup_zone_pageset(zone); + + for_each_online_pgdat(pgdat) + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); } -static noinline __init_refok +static noinline __ref int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -5602,6 +5528,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5705,28 +5637,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. */ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; - - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); - - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } @@ -5933,6 +5860,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); + spin_lock_init(&pgdat->lru_lock); + lruvec_init(node_lruvec(pgdat)); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5982,21 +5911,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) - / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; + zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); - zone->zone_pgdat = pgdat; zone_pcp_init(zone); - /* For bootup, initialized properly in watermark setup */ - mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - - lruvec_init(&zone->lruvec); if (!size) continue; @@ -6008,7 +5929,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } -static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -6062,11 +5983,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, @@ -6448,15 +6370,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; @@ -6720,6 +6645,9 @@ static void calculate_totalreserve_pages(void) enum zone_type i, j; for_each_online_pgdat(pgdat) { + + pgdat->totalreserve_pages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; @@ -6736,7 +6664,7 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; - zone->totalreserve_pages = max; + pgdat->totalreserve_pages += max; reserve_pages += max; } @@ -6837,10 +6765,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - spin_unlock_irqrestore(&zone->lock, flags); } @@ -6907,6 +6831,12 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + return 0; } core_initcall(init_per_zone_wmark_min) @@ -6948,35 +6878,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA +static void setup_min_unmapped_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_unmapped_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * + sysctl_min_unmapped_ratio) / 100; +} + + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + setup_min_unmapped_ratio(); + return 0; } +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * + sysctl_min_slab_ratio) / 100; +} + int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + setup_min_slab_ratio(); + return 0; } #endif @@ -7054,6 +7007,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). + */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -7078,6 +7042,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) |