aboutsummaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c1007
1 files changed, 682 insertions, 325 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2ef1c17942f..3eb01dedfb50 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,11 +16,11 @@
#include <linux/stddef.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
@@ -66,6 +66,7 @@
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
+#include <linux/psi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
#endif
/* work_structs for global per-cpu drains */
+struct pcpu_drain {
+ struct zone *zone;
+ struct work_struct work;
+};
DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
-char * const migratetype_names[MIGRATE_TYPES] = {
+const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;
-static unsigned long nr_kernel_pages __meminitdata;
-static unsigned long nr_all_pages __meminitdata;
-static unsigned long dma_reserve __meminitdata;
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -285,8 +289,8 @@ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+ if (!static_branch_unlikely(&deferred_pages))
+ kasan_free_pages(page, order);
+}
+
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
@@ -306,36 +336,50 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
/* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
- }
- return true;
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ nr_initialised++;
+ if ((nr_initialised > PAGES_PER_SECTION) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
+ }
+ return false;
}
#else
+#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -419,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
@@ -744,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions*/
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations polluate a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -777,6 +873,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -792,6 +889,11 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -1011,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1030,7 +1132,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
- kasan_free_pages(page, order);
+ kasan_free_nondeferred_pages(page, order);
return true;
}
@@ -1176,6 +1278,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
@@ -1231,7 +1334,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
- SetPageReserved(page);
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
}
@@ -1252,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1267,7 +1375,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
__ClearPageReserved(p);
set_page_count(p, 0);
- page_zone(page)->managed_pages += nr_pages;
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -1326,12 +1434,12 @@ meminit_pfn_in_nid(unsigned long pfn, int node,
#endif
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1421,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn,
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
+ __free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
+ __free_pages_core(page, 0);
}
}
@@ -1594,13 +1702,6 @@ static int __init deferred_init_memmap(void *data)
}
/*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1901,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -1969,8 +2070,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
@@ -2015,10 +2116,6 @@ static int move_freepages(struct zone *zone,
pfn_valid(page_to_pfn(end_page)) &&
page_zone(start_page) != page_zone(end_page));
#endif
-
- if (num_movable)
- *num_movable = 0;
-
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -2058,6 +2155,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
+ if (num_movable)
+ *num_movable = 0;
+
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
@@ -2118,6 +2218,33 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
+static inline void boost_watermark(struct zone *zone)
+{
+ unsigned long max_boost;
+
+ if (!watermark_boost_factor)
+ return;
+
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+ watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
+ max_boost = max(pageblock_nr_pages, max_boost);
+
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+ max_boost);
+}
+
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
@@ -2127,7 +2254,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type, bool whole_block)
+ unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
@@ -2149,6 +2276,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
goto single_page;
}
+ /*
+ * Boost watermarks to increase reclaim pressure to reduce the
+ * likelihood of future fallbacks. Wake kswapd now as the node
+ * may be balanced overall and kswapd will not wake naturally.
+ */
+ boost_watermark(zone);
+ if (alloc_flags & ALLOC_KSWAPD)
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
@@ -2247,7 +2383,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
* Check is race-prone but harmless.
*/
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
@@ -2364,20 +2500,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* condition simpler.
*/
static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+ unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
+ int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
+ * Do not steal pages from freelists belonging to other pageblocks
+ * i.e. orders < pageblock_order. If there are no local zones free,
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+ */
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ min_order = pageblock_order;
+
+ /*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= order;
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2422,7 +2568,8 @@ do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2436,7 +2583,8 @@ do_steal:
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+ unsigned int alloc_flags)
{
struct page *page;
@@ -2446,7 +2594,8 @@ retry:
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page && __rmqueue_fallback(zone, order, migratetype))
+ if (!page && __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags))
goto retry;
}
@@ -2461,13 +2610,14 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype)
+ int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page = __rmqueue(zone, order, migratetype,
+ alloc_flags);
if (unlikely(page == NULL))
break;
@@ -2581,6 +2731,10 @@ void drain_local_pages(struct zone *zone)
static void drain_local_pages_wq(struct work_struct *work)
{
+ struct pcpu_drain *drain;
+
+ drain = container_of(work, struct pcpu_drain, work);
+
/*
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
@@ -2589,7 +2743,7 @@ static void drain_local_pages_wq(struct work_struct *work)
* a different one.
*/
preempt_disable();
- drain_local_pages(NULL);
+ drain_local_pages(drain->zone);
preempt_enable();
}
@@ -2660,12 +2814,14 @@ void drain_all_pages(struct zone *zone)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
- INIT_WORK(work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, work);
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+ drain->zone = zone;
+ INIT_WORK(&drain->work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -2863,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -2923,6 +3079,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
@@ -2932,7 +3089,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype);
+ migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
@@ -2948,7 +3105,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int migratetype,
+ unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -2958,7 +3116,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -2981,7 +3139,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype);
+ gfp_flags, migratetype, alloc_flags);
goto out;
}
@@ -3000,7 +3158,7 @@ struct page *rmqueue(struct zone *preferred_zone,
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
@@ -3013,6 +3171,12 @@ struct page *rmqueue(struct zone *preferred_zone,
local_irq_restore(flags);
out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
@@ -3042,7 +3206,7 @@ static int __init setup_fail_page_alloc(char *str)
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
return false;
@@ -3066,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void)
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
- return 0;
-fail:
- debugfs_remove_recursive(dir);
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
- return -ENOMEM;
+ return 0;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3092,13 +3246,19 @@ late_initcall(fail_page_alloc_debugfs);
#else /* CONFIG_FAIL_PAGE_ALLOC */
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
@@ -3243,6 +3403,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
#endif /* CONFIG_NUMA */
/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+ unsigned int alloc_flags = 0;
+
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+ if (zone_idx(zone) != ZONE_NORMAL)
+ goto out;
+
+ /*
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+ * on UMA that if Normal is populated then so is DMA32.
+ */
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
+ goto out;
+
+out:
+#endif /* CONFIG_ZONE_DMA32 */
+ return alloc_flags;
+}
+
+/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
@@ -3250,14 +3444,18 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zoneref *z = ac->preferred_zoneref;
+ struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool no_fallback;
+retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+ z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
@@ -3296,7 +3494,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (no_fallback && nr_online_nodes > 1 &&
+ zone != ac->preferred_zoneref->zone) {
+ int local_nid;
+
+ /*
+ * If moving to a remote node, retry but allow
+ * fragmenting fallbacks. Locality is more important
+ * than fragmentation avoidance.
+ */
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ if (zone_to_nid(zone) != local_nid) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
@@ -3363,21 +3577,16 @@ try_this_zone:
}
}
- return NULL;
-}
-
-/*
- * Large machines with many possible nodes should not always dump per-node
- * meminfo in irq context.
- */
-static inline bool should_suppress_show_mem(void)
-{
- bool ret = false;
+ /*
+ * It's possible on a UMA machine to get through all zones that are
+ * fragmented. If avoiding fragmentation, reset and try again.
+ */
+ if (no_fallback) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
-#if NODES_SHIFT > 8
- ret = in_interrupt();
-#endif
- return ret;
+ return NULL;
}
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
@@ -3385,7 +3594,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
unsigned int filter = SHOW_MEM_FILTER_NODES;
static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
+ if (!__ratelimit(&show_mem_rs))
return;
/*
@@ -3416,13 +3625,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
current->comm, &vaf, gfp_mask, &gfp_mask,
nodemask_pr_args(nodemask));
va_end(args);
cpuset_print_current_mems_allowed();
-
+ pr_cont("\n");
dump_stack();
warn_alloc_show_mem(gfp_mask, nodemask);
}
@@ -3548,19 +3757,26 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
+ unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
+ if (*compact_result <= COMPACT_INACTIVE) {
+ WARN_ON_ONCE(page);
return NULL;
+ }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3568,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -3756,11 +3978,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
+ unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
@@ -3772,6 +3996,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
+ psi_memstall_leave(&pflags);
cond_resched();
@@ -3856,6 +4081,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -3922,6 +4150,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
{
struct zone *zone;
struct zoneref *z;
+ bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +4214,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
}
}
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that
- * a particular WQ is congested if the worker thread is
- * looping without ever sleeping. Therefore we have to
- * do a short sleep here rather than calling
- * cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
- return true;
+ ret = true;
+ goto out;
}
}
- return false;
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
}
static inline bool
@@ -4056,17 +4284,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
/*
- * In the slowpath, we sanity check order to avoid ever trying to
- * reclaim >= MAX_ORDER areas which will never succeed. Callers may
- * be using allocators in order of preference for an area that is
- * too large.
- */
- if (order >= MAX_ORDER) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
- return NULL;
- }
-
- /*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
@@ -4098,7 +4315,7 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
@@ -4156,7 +4373,7 @@ retry_cpuset:
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4359,6 +4576,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
+ /*
+ * There are several places where we assume that the order value is sane
+ * so bail out early if the request is out of bound.
+ */
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
@@ -4366,6 +4592,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /*
+ * Forbid the first pass from falling back to types that fragment
+ * memory until all local zones are considered.
+ */
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
@@ -4391,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4424,16 +4656,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
{
- if (put_page_testzero(page)) {
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (order == 0) /* Via pcp? */
+ free_unref_page(page);
+ else
+ __free_pages_ok(page, order);
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ if (put_page_testzero(page))
+ free_the_page(page, order);
+}
EXPORT_SYMBOL(__free_pages);
void free_pages(unsigned long addr, unsigned int order)
@@ -4482,14 +4717,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
- if (page_ref_sub_and_test(page, count)) {
- unsigned int order = compound_order(page);
-
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (page_ref_sub_and_test(page, count))
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4513,11 +4742,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
@@ -4533,10 +4762,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
@@ -4555,7 +4784,7 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- __free_pages_ok(page, compound_order(page));
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
@@ -4587,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
@@ -4607,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -4640,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4657,7 +4892,7 @@ static unsigned long nr_free_zone_pages(int offset)
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
- unsigned long size = zone->managed_pages;
+ unsigned long size = zone_managed_pages(zone);
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
@@ -4671,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset)
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4683,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -4701,6 +4941,7 @@ long si_mem_available(void)
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
struct zone *zone;
int lru;
@@ -4708,7 +4949,7 @@ long si_mem_available(void)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
+ wmark_low += low_wmark_pages(zone);
/*
* Estimate the amount of memory available for userspace allocations,
@@ -4726,19 +4967,13 @@ long si_mem_available(void)
available += pagecache;
/*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
- */
- available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
- wmark_low);
-
- /*
- * Part of the kernel memory, which can be released under memory
- * pressure.
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
*/
- available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
- PAGE_SHIFT;
+ reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
@@ -4748,11 +4983,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
- val->totalram = totalram_pages;
+ val->totalram = totalram_pages();
val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages;
+ val->totalhigh = totalhigh_pages();
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
@@ -4769,7 +5004,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4778,7 +5013,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
struct zone *zone = &pgdat->node_zones[zone_type];
if (is_highmem(zone)) {
- managed_highpages += zone->managed_pages;
+ managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
@@ -4985,7 +5220,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
- K(zone->managed_pages),
+ K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5134,7 +5369,8 @@ static int node_load[MAX_NUMNODES];
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5440,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -5449,76 +5685,151 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
+ * up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
- * memory
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
*/
- if (altmap && start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
continue;
- }
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
- SetPageReserved(page);
+ __SetPageReserved(page);
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+ }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long size,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+ return;
+
+ /*
+ * The call to memmap_init_zone should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (pgmap->altmap_valid) {
+ struct vmem_altmap *altmap = &pgmap->altmap;
+
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ size = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer and hmm_data. It is a bug if a ZONE_DEVICE
+ * page is ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->hmm_data = 0;
/*
* Mark the block movable so that blocks are reserved for
@@ -5540,8 +5851,12 @@ not_early:
cond_resched();
}
}
+
+ pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+ size, jiffies_to_msecs(jiffies - start));
}
+#endif
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
@@ -5551,10 +5866,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -5565,7 +5881,7 @@ static int zone_batchsize(struct zone *zone)
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
- batch = zone->managed_pages / 1024;
+ batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
@@ -5646,7 +5962,6 @@ static void pageset_init(struct per_cpu_pageset *p)
memset(p, 0, sizeof(*p));
pcp = &p->pcp;
- pcp->count = 0;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
@@ -5676,7 +5991,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
- (zone->managed_pages /
+ (zone_managed_pages(zone) /
percpu_pagelist_fraction));
else
pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5735,8 +6050,10 @@ void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
+ int zone_idx = zone_idx(zone) + 1;
- pgdat->nr_zones = zone_idx(zone) + 1;
+ if (zone_idx > pgdat->nr_zones)
+ pgdat->nr_zones = zone_idx;
zone->zone_start_pfn = zone_start_pfn;
@@ -5766,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -5828,7 +6145,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
-void __meminit get_pfn_range_for_nid(unsigned int nid,
+void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
unsigned long this_start_pfn, this_end_pfn;
@@ -5877,7 +6194,7 @@ static void __init find_usable_zone_for_movable(void)
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
* zones within a node are in order of monotonic increases memory addresses
*/
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -5908,7 +6225,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -5943,7 +6260,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -5964,7 +6281,7 @@ unsigned long __meminit __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -5973,7 +6290,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
}
/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6025,7 +6342,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static inline unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6044,7 +6361,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
return zones_size[zone_type];
}
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+static inline unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6058,7 +6375,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zones_size,
@@ -6126,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
- memblock_virt_alloc_node_nopanic(usemapsize,
+ memblock_alloc_node_nopanic(usemapsize,
pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6231,7 +6552,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
unsigned long remaining_pages)
{
- zone->managed_pages = remaining_pages;
+ atomic_long_set(&zone->managed_pages, remaining_pages);
zone_set_nid(zone, nid);
zone->name = zone_names[idx];
zone->zone_pgdat = NODE_DATA(nid);
@@ -6358,7 +6679,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+ map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6384,12 +6708,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
#else
@@ -6427,48 +6745,67 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat);
}
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Zero all valid struct pages in range [spfn, epfn), return number of struct
+ * pages zeroed
+ */
+static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
*/
void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
- unsigned long pfn;
u64 i, pgcnt;
+ phys_addr_t next = 0;
/*
- * Loop through ranges that are reserved, but do not have reported
- * physical memory backing.
+ * Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_resv_unavail_range(i, &start, &end) {
- for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- mm_zero_struct_page(pfn_to_page(pfn));
- pgcnt++;
- }
+ for_each_mem_range(i, &memblock.memory, NULL,
+ NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (next < start)
+ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ next = end;
}
+ pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
- * Once memblock is changed so such behaviour is not allowed: i.e.
- * list of "reserved" memory must be a subset of list of "memory", then
- * this code can be removed.
*/
if (pgcnt)
- pr_info("Reserved but unavailable: %lld pages", pgcnt);
+ pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
+#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6501,14 +6838,14 @@ void __init setup_nr_node_ids(void)
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6556,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -6803,15 +7140,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
- if (N_MEMORY == N_NORMAL_MEMORY)
- return;
-
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
- node_set_state(nid, N_HIGH_MEMORY);
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
- zone_type <= ZONE_NORMAL)
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
@@ -6967,18 +7301,16 @@ early_param("movablecore", cmdline_parse_movablecore);
void adjust_managed_page_count(struct page *page, long count)
{
- spin_lock(&managed_page_count_lock);
- page_zone(page)->managed_pages += count;
- totalram_pages += count;
+ atomic_long_add(count, &page_zone(page)->managed_pages);
+ totalram_pages_add(count);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
- totalhigh_pages += count;
+ totalhigh_pages_add(count);
#endif
- spin_unlock(&managed_page_count_lock);
}
EXPORT_SYMBOL(adjust_managed_page_count);
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
void *pos;
unsigned long pages = 0;
@@ -7009,15 +7341,14 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
- totalram_pages++;
- page_zone(page)->managed_pages++;
- totalhigh_pages++;
+ totalram_pages_inc();
+ atomic_long_inc(&page_zone(page)->managed_pages);
+ totalhigh_pages_inc();
}
#endif
@@ -7066,10 +7397,10 @@ void __init mem_init_print_info(const char *str)
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT - 10),
+ totalhigh_pages() << (PAGE_SHIFT - 10),
#endif
str ? ", " : "", str ? str : "");
}
@@ -7149,6 +7480,7 @@ static void calculate_totalreserve_pages(void)
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
+ unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7159,8 +7491,8 @@ static void calculate_totalreserve_pages(void)
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > zone->managed_pages)
- max = zone->managed_pages;
+ if (max > managed_pages)
+ max = managed_pages;
pgdat->totalreserve_pages += max;
@@ -7184,7 +7516,7 @@ static void setup_per_zone_lowmem_reserve(void)
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone->managed_pages;
+ unsigned long managed_pages = zone_managed_pages(zone);
zone->lowmem_reserve[j] = 0;
@@ -7202,7 +7534,7 @@ static void setup_per_zone_lowmem_reserve(void)
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
}
- managed_pages += lower_zone->managed_pages;
+ managed_pages += zone_managed_pages(lower_zone);
}
}
}
@@ -7221,14 +7553,14 @@ static void __setup_per_zone_wmarks(void)
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
- lowmem_pages += zone->managed_pages;
+ lowmem_pages += zone_managed_pages(zone);
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
+ tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
@@ -7237,20 +7569,20 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
- min_pages = zone->managed_pages / 1024;
+ min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
- zone->watermark[WMARK_MIN] = min_pages;
+ zone->_watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->_watermark[WMARK_MIN] = tmp;
}
/*
@@ -7259,11 +7591,12 @@ static void __setup_per_zone_wmarks(void)
* ensure a minimum size on small systems.
*/
tmp = max_t(u64, tmp >> 2,
- mult_frac(zone->managed_pages,
+ mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7364,6 +7697,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
@@ -7389,8 +7734,8 @@ static void setup_min_unmapped_ratio(void)
pgdat->min_unmapped_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
- sysctl_min_unmapped_ratio) / 100;
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+ sysctl_min_unmapped_ratio) / 100;
}
@@ -7417,8 +7762,8 @@ static void setup_min_slab_ratio(void)
pgdat->min_slab_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
- sysctl_min_slab_ratio) / 100;
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+ sysctl_min_slab_ratio) / 100;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7614,9 +7959,11 @@ void *__init alloc_large_system_hash(const char *tablename,
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
- table = memblock_virt_alloc_nopanic(size, 0);
+ table = memblock_alloc_nopanic(size,
+ SMP_CACHE_BYTES);
else
- table = memblock_virt_alloc_raw(size, 0);
+ table = memblock_alloc_raw(size,
+ SMP_CACHE_BYTES);
} else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
} else {
@@ -7656,8 +8003,7 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
- int migratetype,
- bool skip_hwpoisoned_pages)
+ int migratetype, int flags)
{
unsigned long pfn, iter, found;
@@ -7691,16 +8037,27 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
goto unmovable;
/*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
- if (!hugepage_migration_supported(page_hstate(page)))
+ if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+ skip_pages = (1 << compound_order(head)) - (page - head);
+ iter += skip_pages - 1;
continue;
}
@@ -7720,7 +8077,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if (skip_hwpoisoned_pages && PageHWPoison(page))
+ if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
continue;
if (__PageMovable(page))
@@ -7747,6 +8104,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
return false;
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+ if (flags & REPORT_FAILURE)
+ dump_page(pfn_to_page(pfn+iter), "unmovable page");
return true;
}
@@ -7826,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -7873,8 +8232,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype,
- false);
+ pfn_max_align_up(end), migratetype, 0);
if (ret)
return ret;
@@ -7911,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;