aboutsummaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c886
1 files changed, 553 insertions, 333 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440e3ae2..9f9623d690d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,10 +55,10 @@
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
+#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
-#include <linux/page_ext.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
@@ -91,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
@@ -714,7 +718,7 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
@@ -729,9 +733,6 @@ static inline void rmv_page_order(struct page *page)
static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
- if (!pfn_valid_within(page_to_pfn(buddy)))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
@@ -787,9 +788,8 @@ static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
- unsigned long page_idx;
- unsigned long combined_idx;
- unsigned long uninitialized_var(buddy_idx);
+ unsigned long combined_pfn;
+ unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
@@ -802,15 +802,16 @@ static inline void __free_one_page(struct page *page,
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
- VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order - 1) {
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (!pfn_valid_within(buddy_pfn))
+ goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
@@ -824,9 +825,9 @@ continue_merging:
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
- combined_idx = buddy_idx & page_idx;
- page = page + (combined_idx - page_idx);
- page_idx = combined_idx;
+ combined_pfn = buddy_pfn & pfn;
+ page = page + (combined_pfn - pfn);
+ pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
@@ -841,8 +842,8 @@ continue_merging:
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
@@ -865,12 +866,12 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
- combined_idx = buddy_idx & page_idx;
- higher_page = page + (combined_idx - page_idx);
- buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = higher_page + (buddy_idx - combined_idx);
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -1087,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- unsigned long nr_scanned;
+ unsigned long nr_scanned, flags;
bool isolated_pageblocks;
- spin_lock(&zone->lock);
+ spin_lock_irqsave(&zone->lock, flags);
isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
@@ -1139,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, 0, mt);
} while (--count && --batch_free && !list_empty(list));
}
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void free_one_page(struct zone *zone,
@@ -1147,8 +1148,9 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype)
{
- unsigned long nr_scanned;
- spin_lock(&zone->lock);
+ unsigned long nr_scanned, flags;
+ spin_lock_irqsave(&zone->lock, flags);
+ __count_vm_events(PGFREE, 1 << order);
nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1158,7 +1160,7 @@ static void free_one_page(struct zone *zone,
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype);
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1236,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
static void __free_pages_ok(struct page *page, unsigned int order)
{
- unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
@@ -1244,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
- __count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
- local_irq_restore(flags);
}
static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -1864,14 +1862,14 @@ int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
if (!PageBuddy(page)) {
page++;
continue;
@@ -2058,8 +2056,12 @@ out_unlock:
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
*/
-static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
@@ -2067,11 +2069,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
struct zone *zone;
struct page *page;
int order;
+ bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
ac->nodemask) {
- /* Preserve at least one pageblock */
- if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2085,13 +2092,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
continue;
/*
- * It should never happen but changes to locking could
- * inadvertently allow a per-cpu drain to add pages
- * to MIGRATE_HIGHATOMIC while unreserving so be safe
- * and watch for underflows.
+ * In page freeing path, migratetype change is racy so
+ * we can counter several free pages in a pageblock
+ * in this loop althoug we changed the pageblock type
+ * from highatomic to ac->migratetype. So we should
+ * adjust the count once.
*/
- zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
- zone->nr_reserved_highatomic);
+ if (get_pageblock_migratetype(page) ==
+ MIGRATE_HIGHATOMIC) {
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ zone->nr_reserved_highatomic -= min(
+ pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+ }
/*
* Convert to ac->migratetype and avoid the normal
@@ -2103,12 +2122,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- move_freepages_block(zone, page, ac->migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
- return;
+ ret = move_freepages_block(zone, page, ac->migratetype);
+ if (ret) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
+
+ return false;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -2133,7 +2156,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal)
+ if (can_steal &&
+ get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@@ -2192,9 +2216,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, bool cold)
{
- int i;
+ int i, alloced = 0;
+ unsigned long flags;
- spin_lock(&zone->lock);
+ spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
@@ -2217,13 +2242,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
+ alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
+
+ /*
+ * i pages were removed from the buddy list even if some leak due
+ * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+ * on i. Do not confuse with 'alloced' which is the number of
+ * pages added to the pcp list.
+ */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock(&zone->lock);
- return i;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return alloced;
}
#ifdef CONFIG_NUMA
@@ -2307,16 +2340,26 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
+static void drain_local_pages_wq(struct work_struct *work)
+{
+ /*
+ * drain_all_pages doesn't use proper cpu hotplug protection so
+ * we can race with cpu offline when the WQ can move this from
+ * a cpu pinned worker to an unbound one. We can operate on a different
+ * cpu which is allright but we also have to make sure to not move to
+ * a different one.
+ */
+ preempt_disable();
+ drain_local_pages(NULL);
+ preempt_enable();
+}
+
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
*
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
@@ -2328,6 +2371,21 @@ void drain_all_pages(struct zone *zone)
*/
static cpumask_t cpus_with_pcps;
+ /* Workqueues cannot recurse */
+ if (current->flags & PF_WQ_WORKER)
+ return;
+
+ /*
+ * Do not drain if one is already in progress unless it's specific to
+ * a zone. Such callers are primarily CMA and memory hotplug and need
+ * the drain to be complete when the call returns.
+ */
+ if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+ if (!zone)
+ return;
+ mutex_lock(&pcpu_drain_mutex);
+ }
+
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
@@ -2358,8 +2416,16 @@ void drain_all_pages(struct zone *zone)
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
- on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
- zone, 1);
+
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+ INIT_WORK(work, drain_local_pages_wq);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, &cpus_with_pcps)
+ flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+ mutex_unlock(&pcpu_drain_mutex);
}
#ifdef CONFIG_HIBERNATION
@@ -2410,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
- unsigned long flags;
unsigned long pfn = page_to_pfn(page);
int migratetype;
+ if (in_interrupt()) {
+ __free_pages_ok(page, 0);
+ return;
+ }
+
if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- local_irq_save(flags);
- __count_vm_event(PGFREE);
+ preempt_disable();
/*
* We only track unmovable, reclaimable and movable on pcp lists.
@@ -2437,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold)
migratetype = MIGRATE_MOVABLE;
}
+ __count_vm_event(PGFREE);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2450,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold)
}
out:
- local_irq_restore(flags);
+ preempt_enable();
}
/*
@@ -2534,7 +2604,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+ && mt != MIGRATE_HIGHATOMIC)
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -2548,101 +2619,124 @@ int __isolate_free_page(struct page *page, unsigned int order)
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
- gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- int local_nid = numa_node_id();
enum zone_stat_item local_stat = NUMA_LOCAL;
- if (unlikely(flags & __GFP_OTHER_NODE)) {
+ if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
- local_nid = preferred_zone->node;
- }
- if (z->node == local_nid) {
+ if (z->node == preferred_zone->node)
__inc_zone_state(z, NUMA_HIT);
- __inc_zone_state(z, local_stat);
- } else {
+ else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
+ __inc_zone_state(z, local_stat);
#endif
}
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ bool cold, struct per_cpu_pages *pcp,
+ struct list_head *list)
+{
+ struct page *page;
+
+ VM_BUG_ON(in_interrupt());
+
+ do {
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
+ return NULL;
+ }
+
+ if (cold)
+ page = list_last_entry(list, struct page, lru);
+ else
+ page = list_first_entry(list, struct page, lru);
+
+ list_del(&page->lru);
+ pcp->count--;
+ } while (check_new_pcp(page));
+
+ return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
+{
+ struct per_cpu_pages *pcp;
+ struct list_head *list;
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
+ struct page *page;
+
+ preempt_disable();
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ if (page) {
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone);
+ }
+ preempt_enable();
+ return page;
+}
+
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
-
- if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
- struct list_head *list;
-
- local_irq_save(flags);
- do {
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
-
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
- list_del(&page->lru);
- pcp->count--;
+ if (likely(order == 0) && !in_interrupt()) {
+ page = rmqueue_pcplist(preferred_zone, zone, order,
+ gfp_flags, migratetype);
+ goto out;
+ }
- } while (check_new_pcp(page));
- } else {
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+ spin_lock_irqsave(&zone->lock, flags);
- do {
- page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page)
- page = __rmqueue(zone, order, migratetype);
- } while (page && check_new_pages(page, order));
- spin_unlock(&zone->lock);
+ do {
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
if (!page)
- goto failed;
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- }
+ page = __rmqueue(zone, order, migratetype);
+ } while (page && check_new_pages(page, order));
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, gfp_flags);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+ VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
@@ -2850,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
#ifdef CONFIG_NUMA
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
- return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
@@ -2947,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
try_this_zone:
- page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+ page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -2980,18 +3074,12 @@ static inline bool should_suppress_show_mem(void)
return ret;
}
-static DEFINE_RATELIMIT_STATE(nopage_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- struct va_format vaf;
- va_list args;
+ static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
- debug_guardpage_minorder() > 0)
+ if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
return;
/*
@@ -3006,6 +3094,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
+ show_mem(filter, nodemask);
+}
+
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
+ return;
+
pr_warn("%s: ", current->comm);
va_start(args, fmt);
@@ -3014,11 +3116,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
pr_cont("%pV", &vaf);
va_end(args);
- pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+ pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
+ if (nodemask)
+ pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
+ else
+ pr_cont("(null)\n");
+
+ cpuset_print_current_mems_allowed();
dump_stack();
- if (!should_suppress_show_mem())
- show_mem(filter);
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags,
+ const struct alloc_context *ac)
+{
+ struct page *page;
+
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags, ac);
+
+ return page;
}
static inline struct page *
@@ -3056,47 +3183,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (page)
goto out;
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /* Coredumps can quickly deplete all memory reserves */
- if (current->flags & PF_DUMPCORE)
- goto out;
- /* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto out;
- /* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
- goto out;
- if (pm_suspended_storage())
- goto out;
- /*
- * XXX: GFP_NOFS allocations should rather fail than rely on
- * other request to make a forward progress.
- * We are in an unfortunate situation where out_of_memory cannot
- * do much for this context but let's try it to at least get
- * access to memory reserved if the current task is killed (see
- * out_of_memory). Once filesystems are ready to handle allocation
- * failures more gracefully we should just bail out here.
- */
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (ac->high_zoneidx < ZONE_NORMAL)
+ goto out;
+ if (pm_suspended_storage())
+ goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other request to make a forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
+ /* The OOM killer may not free memory on a specific node */
+ if (gfp_mask & __GFP_THISNODE)
+ goto out;
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
- }
/* Exhausted what can be done so it's blamo time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
- if (gfp_mask & __GFP_NOFAIL) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
- /*
- * fallback to ignore cpuset restriction if our nodes
- * are depleted
- */
- if (!page)
- page = get_page_from_freelist(gfp_mask, order,
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves
+ */
+ if (gfp_mask & __GFP_NOFAIL)
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
- }
}
out:
mutex_unlock(&oom_lock);
@@ -3165,6 +3287,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
{
int max_retries = MAX_COMPACT_RETRIES;
int min_priority;
+ bool ret = false;
+ int retries = *compaction_retries;
+ enum compact_priority priority = *compact_priority;
if (!order)
return false;
@@ -3186,8 +3311,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
* But do not retry if the given zonelist is not suitable for
* compaction.
*/
- if (compaction_withdrawn(compact_result))
- return compaction_zonelist_suitable(ac, order, alloc_flags);
+ if (compaction_withdrawn(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
/*
* !costly requests are much more important than __GFP_REPEAT
@@ -3199,8 +3326,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
- if (*compaction_retries <= max_retries)
- return true;
+ if (*compaction_retries <= max_retries) {
+ ret = true;
+ goto out;
+ }
/*
* Make sure there are attempts at the highest priority if we exhausted
@@ -3209,12 +3338,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+
if (*compact_priority > min_priority) {
(*compact_priority)--;
*compaction_retries = 0;
- return true;
+ ret = true;
}
- return false;
+out:
+ trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+ return ret;
}
#else
static inline struct page *
@@ -3305,7 +3437,7 @@ retry:
* Shrink them them and try again
*/
if (!page && !drained) {
- unreserve_highatomic_pageblock(ac);
+ unreserve_highatomic_pageblock(ac, false);
drain_all_pages(NULL);
drained = true;
goto retry;
@@ -3422,8 +3554,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (*no_progress_loops > MAX_RECLAIM_RETRIES)
- return false;
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
+ /* Before OOM, exhaust highatomic_reserve */
+ return unreserve_highatomic_pageblock(ac, true);
+ }
/*
* Keep reclaiming pages while there is a chance this will lead
@@ -3435,6 +3569,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
+ unsigned long min_wmark = min_wmark_pages(zone);
+ bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3445,8 +3581,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Would the allocation succeed if we reclaimed the whole
* available?
*/
- if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags, available)) {
+ wmark = __zone_watermark_ok(zone, order, min_wmark,
+ ac_classzone_idx(ac), alloc_flags, available);
+ trace_reclaim_retry_zone(z, order, reclaimable,
+ available, min_wmark, *no_progress_loops, wmark);
+ if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
@@ -3494,12 +3633,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+ enum compact_priority compact_priority;
enum compact_result compact_result;
- int compaction_retries = 0;
- int no_progress_loops = 0;
+ int compaction_retries;
+ int no_progress_loops;
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
+ unsigned int cpuset_mems_cookie;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3520,6 +3660,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
+retry_cpuset:
+ compaction_retries = 0;
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3527,6 +3673,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * We need to recalculate the starting point for the zonelist iterator
+ * because we might have used different nodemask in the fast path, or
+ * there was a cpuset modification and we are retrying - otherwise we
+ * could end up iterating over non-eligible zones endlessly.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+ if (!ac->preferred_zoneref->zone)
+ goto nopage;
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3603,35 +3760,21 @@ retry:
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
- if (!can_direct_reclaim) {
- /*
- * All existing users of the __GFP_NOFAIL are blockable, so warn
- * of any new users that actually allow this type of allocation
- * to fail.
- */
- WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+ if (!can_direct_reclaim)
goto nopage;
- }
- /* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC) {
- /*
- * __GFP_NOFAIL request from this context is rather bizarre
- * because we cannot reclaim anything and only can loop waiting
- * for somebody to do a work for us.
- */
- if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
- cond_resched();
- goto retry;
- }
- goto nopage;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask, ac->nodemask,
+ "page allocation stalls for %ums, order:%u",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
}
- /* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+ /* Avoid recursion of direct reclaim */
+ if (current->flags & PF_MEMALLOC)
goto nopage;
-
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
@@ -3655,14 +3798,6 @@ retry:
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
@@ -3679,11 +3814,22 @@ retry:
&compaction_retries))
goto retry;
+ /*
+ * It's possible we raced with cpuset update so the OOM would be
+ * premature (see below the nopage: label for full explanation).
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (test_thread_flag(TIF_MEMDIE))
+ goto nopage;
+
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
@@ -3691,74 +3837,127 @@ retry:
}
nopage:
- warn_alloc(gfp_mask,
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
+ /*
+ * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+ * we always retry
+ */
+ if (gfp_mask & __GFP_NOFAIL) {
+ /*
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually require GFP_NOWAIT
+ */
+ if (WARN_ON_ONCE(!can_direct_reclaim))
+ goto fail;
+
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do a work for us
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+ /*
+ * non failing costly orders are a hard requirement which we
+ * are not prepared for much so let's warn about these users
+ * so that we can identify them and convert them to something
+ * else.
+ */
+ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * could deplete whole memory reserves which would just make
+ * the situation worse
+ */
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ if (page)
+ goto got_pg;
+
+ cond_resched();
+ goto retry;
+ }
+fail:
+ warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask,
+ struct alloc_context *ac, gfp_t *alloc_mask,
+ unsigned int *alloc_flags)
{
- struct page *page;
- unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
- struct alloc_context ac = {
- .high_zoneidx = gfp_zone(gfp_mask),
- .zonelist = zonelist,
- .nodemask = nodemask,
- .migratetype = gfpflags_to_migratetype(gfp_mask),
- };
+ ac->high_zoneidx = gfp_zone(gfp_mask);
+ ac->zonelist = zonelist;
+ ac->nodemask = nodemask;
+ ac->migratetype = gfpflags_to_migratetype(gfp_mask);
if (cpusets_enabled()) {
- alloc_mask |= __GFP_HARDWALL;
- alloc_flags |= ALLOC_CPUSET;
- if (!ac.nodemask)
- ac.nodemask = &cpuset_current_mems_allowed;
+ *alloc_mask |= __GFP_HARDWALL;
+ if (!ac->nodemask)
+ ac->nodemask = &cpuset_current_mems_allowed;
+ else
+ *alloc_flags |= ALLOC_CPUSET;
}
- gfp_mask &= gfp_allowed_mask;
-
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
-
- /*
- * Check the zones suitable for the gfp_mask contain at least one
- * valid zone. It's possible to have an empty zonelist as a result
- * of __GFP_THISNODE and a memoryless node
- */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
+ return false;
- if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
+ if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+ *alloc_flags |= ALLOC_CMA;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
+ return true;
+}
+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+ unsigned int order, struct alloc_context *ac)
+{
/* Dirty zone balancing only done in the fast path */
- ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
- ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
- ac.high_zoneidx, ac.nodemask);
- if (!ac.preferred_zoneref) {
- page = NULL;
- goto no_zone;
- }
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ struct page *page;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = { };
+
+ gfp_mask &= gfp_allowed_mask;
+ if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ return NULL;
+
+ finalise_ac(gfp_mask, order, &ac);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -3776,21 +3975,10 @@ retry_cpuset:
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
- if (cpusets_enabled())
+ if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-no_zone:
- /*
- * When updating a task's mems_allowed, it is possible to race with
- * parallel threads in such a way that an allocation can fail while
- * the mask is being updated. If a page allocation is about to fail,
- * check if the cpuset changed during allocation and if so, retry.
- */
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
- alloc_mask = gfp_mask;
- goto retry_cpuset;
- }
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3867,8 +4055,8 @@ EXPORT_SYMBOL(free_pages);
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3888,8 +4076,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
-void *__alloc_page_frag(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count)) {
+ unsigned int order = compound_order(page);
+
+ if (order == 0)
+ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3897,7 +4100,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
if (unlikely(!nc->va)) {
refill:
- page = __page_frag_refill(nc, gfp_mask);
+ page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
@@ -3940,19 +4143,19 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
@@ -4182,20 +4385,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
*/
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
- bool ret = false;
- unsigned int cpuset_mems_cookie;
-
if (!(flags & SHOW_MEM_FILTER_NODES))
- goto out;
+ return false;
- do {
- cpuset_mems_cookie = read_mems_allowed_begin();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
- return ret;
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4236,7 +4439,7 @@ static void show_migration_types(unsigned char type)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
int cpu;
@@ -4244,7 +4447,7 @@ void show_free_areas(unsigned int filter)
pg_data_t *pgdat;
for_each_populated_zone(zone) {
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
for_each_online_cpu(cpu)
@@ -4278,6 +4481,9 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
@@ -4327,7 +4533,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
int i;
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
free_pcp = 0;
@@ -4392,7 +4598,7 @@ void show_free_areas(unsigned int filter)
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
printk(KERN_CONT "%s: ", zone->name);
@@ -5013,8 +5219,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (context != MEMMAP_EARLY)
goto not_early;
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * Skip to the pfn preceding the next valid one (or
+ * end_pfn), such that we hit a valid pfn (or end_pfn)
+ * on our next iteration of the loop.
+ */
+ pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
+#endif
continue;
+ }
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
@@ -6274,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
start_pfn = end_pfn;
}
- arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
- arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -6399,8 +6612,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK (%p - %p)\n",
- s, pages << (PAGE_SHIFT - 10), start, end);
+ pr_info("Freeing %s memory: %ldK\n",
+ s, pages << (PAGE_SHIFT - 10));
return pages;
}
@@ -6491,38 +6704,39 @@ void __init free_area_init(unsigned long *zones_size)
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-static int page_alloc_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int page_alloc_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- lru_add_drain_cpu(cpu);
- drain_pages(cpu);
+ lru_add_drain_cpu(cpu);
+ drain_pages(cpu);
- /*
- * Spill the event counters of the dead processor
- * into the current processors event counters.
- * This artificially elevates the count of the current
- * processor.
- */
- vm_events_fold_cpu(cpu);
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
+ vm_events_fold_cpu(cpu);
- /*
- * Zero the differential counters of the dead processor
- * so that the vm statistics are consistent.
- *
- * This is only okay since the processor is dead and cannot
- * race with what we are doing.
- */
- cpu_vm_stats_fold(cpu);
- }
- return NOTIFY_OK;
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
+ cpu_vm_stats_fold(cpu);
+ return 0;
}
void __init page_alloc_init(void)
{
- hotcpu_notifier(page_alloc_cpu_notify, 0);
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
+ "mm/page_alloc:dead", NULL,
+ page_alloc_cpu_dead);
+ WARN_ON(ret < 0);
}
/*
@@ -7010,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename,
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages)
@@ -7067,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (skip_hwpoisoned_pages && PageHWPoison(page))
continue;
+ if (__PageMovable(page))
+ continue;
+
if (!PageLRU(page))
found++;
/*
@@ -7178,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
+ * @gfp_mask: GFP mask to use during compaction
*
* The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
* aligned, however it's the caller's responsibility to guarantee that
@@ -7191,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* need to be freed with free_contig_range().
*/
int alloc_contig_range(unsigned long start, unsigned long end,
- unsigned migratetype)
+ unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
unsigned int order;
@@ -7203,6 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .gfp_mask = memalloc_noio_flags(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);