From aa187507ef8bb3178a3312d851e8485bd81913c9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:41:45 -0800
Subject: mm: throttle show_mem() from warn_alloc()

Tetsuo has been stressing OOM killer path with many parallel allocation
requests when he has noticed that it is not all that hard to swamp
kernel logs with warn_alloc messages caused by allocation stalls.  Even
though the allocation stall message is triggered only once in 10s there
might be many different tasks hitting it roughly around the same time.

A big part of the output is show_mem() which can generate a lot of
output even on a small machines.  There is no reason to show the state
of memory counter for each allocation stall, especially when multiple of
them are reported in a short time period.  Chances are that not much has
changed since the last report.  This patch simply rate limits show_mem
called from warn_alloc to only dump something once per second.  This
should be enough to give us a clue why an allocation might be stalling
while burst of warnings will not swamp log with too much data.

While we are at it, extract all the show_mem related handling (filters)
into a separate function warn_alloc_show_mem.  This will make the code
cleaner and as a bonus point we can distinguish which part of warn_alloc
got throttled due to rate limiting as ___ratelimit dumps the caller.

[akpm@linux-foundation.org: reduce scope of the ratelimit_states]
Link: http://lkml.kernel.org/r/20161215101510.9030-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3e0c69a97b7..3c790ae4cb52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3007,18 +3007,12 @@ static inline bool should_suppress_show_mem(void)
 	return ret;
 }
 
-static DEFINE_RATELIMIT_STATE(nopage_rs,
-		DEFAULT_RATELIMIT_INTERVAL,
-		DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
-	struct va_format vaf;
-	va_list args;
+	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-	    debug_guardpage_minorder() > 0)
+	if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
 		return;
 
 	/*
@@ -3033,6 +3027,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 		filter &= ~SHOW_MEM_FILTER_NODES;
 
+	show_mem(filter);
+}
+
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+	    debug_guardpage_minorder() > 0)
+		return;
+
 	pr_warn("%s: ", current->comm);
 
 	va_start(args, fmt);
@@ -3044,8 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
 
 	dump_stack();
-	if (!should_suppress_show_mem())
-		show_mem(filter);
+	warn_alloc_show_mem(gfp_mask);
 }
 
 static inline struct page *
-- 
cgit 


From 76741e776a37973a3e398d504069b3e55c5cc866 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 22 Feb 2017 15:41:48 -0800
Subject: mm, page_alloc: don't convert pfn to idx when merging

In __free_one_page() we do the buddy merging arithmetics on "page/buddy
index", which is just the lower MAX_ORDER bits of pfn.  The operations
we do that affect the higher bits are bitwise AND and subtraction (in
that order), where the final result will be the same with the higher
bits left unmasked, as long as these bits are equal for both buddies -
which must be true by the definition of a buddy.

We can therefore use pfn's directly instead of "index" and skip the
zeroing of >MAX_ORDER bits.  This can help a bit by itself, although
compiler might be smart enough already.  It also helps the next patch to
avoid page_to_pfn() for memory hole checks.

Link: http://lkml.kernel.org/r/20161216120009.20064-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h       |  4 ++--
 mm/page_alloc.c     | 31 ++++++++++++++-----------------
 mm/page_isolation.c |  8 ++++----
 3 files changed, 20 insertions(+), 23 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/internal.h b/mm/internal.h
index 7aa2ea0a8623..bfad3b5d2665 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -133,9 +133,9 @@ struct alloc_context {
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
 static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
 {
-	return page_idx ^ (1 << order);
+	return page_pfn ^ (1 << order);
 }
 
 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c790ae4cb52..49d40261f8c4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -787,9 +787,8 @@ static inline void __free_one_page(struct page *page,
 		struct zone *zone, unsigned int order,
 		int migratetype)
 {
-	unsigned long page_idx;
-	unsigned long combined_idx;
-	unsigned long uninitialized_var(buddy_idx);
+	unsigned long combined_pfn;
+	unsigned long uninitialized_var(buddy_pfn);
 	struct page *buddy;
 	unsigned int max_order;
 
@@ -802,15 +801,13 @@ static inline void __free_one_page(struct page *page,
 	if (likely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
 
-	page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
-	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
 continue_merging:
 	while (order < max_order - 1) {
-		buddy_idx = __find_buddy_index(page_idx, order);
-		buddy = page + (buddy_idx - page_idx);
+		buddy_pfn = __find_buddy_pfn(pfn, order);
+		buddy = page + (buddy_pfn - pfn);
 		if (!page_is_buddy(page, buddy, order))
 			goto done_merging;
 		/*
@@ -824,9 +821,9 @@ continue_merging:
 			zone->free_area[order].nr_free--;
 			rmv_page_order(buddy);
 		}
-		combined_idx = buddy_idx & page_idx;
-		page = page + (combined_idx - page_idx);
-		page_idx = combined_idx;
+		combined_pfn = buddy_pfn & pfn;
+		page = page + (combined_pfn - pfn);
+		pfn = combined_pfn;
 		order++;
 	}
 	if (max_order < MAX_ORDER) {
@@ -841,8 +838,8 @@ continue_merging:
 		if (unlikely(has_isolate_pageblock(zone))) {
 			int buddy_mt;
 
-			buddy_idx = __find_buddy_index(page_idx, order);
-			buddy = page + (buddy_idx - page_idx);
+			buddy_pfn = __find_buddy_pfn(pfn, order);
+			buddy = page + (buddy_pfn - pfn);
 			buddy_mt = get_pageblock_migratetype(buddy);
 
 			if (migratetype != buddy_mt
@@ -867,10 +864,10 @@ done_merging:
 	 */
 	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
-		combined_idx = buddy_idx & page_idx;
-		higher_page = page + (combined_idx - page_idx);
-		buddy_idx = __find_buddy_index(combined_idx, order + 1);
-		higher_buddy = higher_page + (buddy_idx - combined_idx);
+		combined_pfn = buddy_pfn & pfn;
+		higher_page = page + (combined_pfn - pfn);
+		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5594bfcc5ed..dadb7e74d7d6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	unsigned long flags, nr_pages;
 	bool isolated_page = false;
 	unsigned int order;
-	unsigned long page_idx, buddy_idx;
+	unsigned long pfn, buddy_pfn;
 	struct page *buddy;
 
 	zone = page_zone(page);
@@ -102,9 +102,9 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	if (PageBuddy(page)) {
 		order = page_order(page);
 		if (order >= pageblock_order) {
-			page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
-			buddy_idx = __find_buddy_index(page_idx, order);
-			buddy = page + (buddy_idx - page_idx);
+			pfn = page_to_pfn(page);
+			buddy_pfn = __find_buddy_pfn(pfn, order);
+			buddy = page + (buddy_pfn - pfn);
 
 			if (pfn_valid_within(page_to_pfn(buddy)) &&
 			    !is_migrate_isolate_page(buddy)) {
-- 
cgit 


From 13ad59df67f19788f6c22985b1a33e466eceb643 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 22 Feb 2017 15:41:51 -0800
Subject: mm, page_alloc: avoid page_to_pfn() when merging buddies

On architectures that allow memory holes, page_is_buddy() has to perform
page_to_pfn() to check for the memory hole.  After the previous patch,
we have the pfn already available in __free_one_page(), which is the
only caller of page_is_buddy(), so move the check there and avoid
page_to_pfn().

Link: http://lkml.kernel.org/r/20161216120009.20064-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 10 +++++-----
 mm/page_isolation.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 49d40261f8c4..af65c4eedc79 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -714,7 +714,7 @@ static inline void rmv_page_order(struct page *page)
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
  * (b) the buddy is in the buddy system &&
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
@@ -729,9 +729,6 @@ static inline void rmv_page_order(struct page *page)
 static inline int page_is_buddy(struct page *page, struct page *buddy,
 							unsigned int order)
 {
-	if (!pfn_valid_within(page_to_pfn(buddy)))
-		return 0;
-
 	if (page_is_guard(buddy) && page_order(buddy) == order) {
 		if (page_zone_id(page) != page_zone_id(buddy))
 			return 0;
@@ -808,6 +805,9 @@ continue_merging:
 	while (order < max_order - 1) {
 		buddy_pfn = __find_buddy_pfn(pfn, order);
 		buddy = page + (buddy_pfn - pfn);
+
+		if (!pfn_valid_within(buddy_pfn))
+			goto done_merging;
 		if (!page_is_buddy(page, buddy, order))
 			goto done_merging;
 		/*
@@ -862,7 +862,7 @@ done_merging:
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
 		struct page *higher_page, *higher_buddy;
 		combined_pfn = buddy_pfn & pfn;
 		higher_page = page + (combined_pfn - pfn);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index dadb7e74d7d6..f4e17a57926a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -106,7 +106,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 			buddy_pfn = __find_buddy_pfn(pfn, order);
 			buddy = page + (buddy_pfn - pfn);
 
-			if (pfn_valid_within(page_to_pfn(buddy)) &&
+			if (pfn_valid_within(buddy_pfn) &&
 			    !is_migrate_isolate_page(buddy)) {
 				__isolate_free_page(page, order);
 				isolated_page = true;
-- 
cgit 


From d379f01de09570e06d84b4b09e5f4951821a1dc8 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:42:00 -0800
Subject: oom, trace: add oom detection tracepoints

should_reclaim_retry is the central decision point for declaring the
OOM.  It might be really useful to expose data used for this decision
making when debugging an unexpected oom situations.

Say we have an OOM report:
[   52.264001] mem_eater invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_score_adj=0
[   52.267549] CPU: 3 PID: 3148 Comm: mem_eater Tainted: G        W       4.8.0-oomtrace3-00006-gb21338b386d2 #1024

Now we can check the tracepoint data to see how we have ended up in this
situation:
       mem_eater-3148  [003] ....    52.432801: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11134 min_wmark=11084 no_progress_loops=1 wmark_check=1
       mem_eater-3148  [003] ....    52.433269: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11103 min_wmark=11084 no_progress_loops=1 wmark_check=1
       mem_eater-3148  [003] ....    52.433712: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11100 min_wmark=11084 no_progress_loops=2 wmark_check=1
       mem_eater-3148  [003] ....    52.434067: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11097 min_wmark=11084 no_progress_loops=3 wmark_check=1
       mem_eater-3148  [003] ....    52.434414: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11094 min_wmark=11084 no_progress_loops=4 wmark_check=1
       mem_eater-3148  [003] ....    52.434761: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11091 min_wmark=11084 no_progress_loops=5 wmark_check=1
       mem_eater-3148  [003] ....    52.435108: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11087 min_wmark=11084 no_progress_loops=6 wmark_check=1
       mem_eater-3148  [003] ....    52.435478: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11084 min_wmark=11084 no_progress_loops=7 wmark_check=0
       mem_eater-3148  [003] ....    52.435478: reclaim_retry_zone: node=0 zone=DMA order=0 reclaimable=0 available=1126 min_wmark=179 no_progress_loops=7 wmark_check=0

The above shows that we can quickly deduce that the reclaim stopped
making any progress (see no_progress_loops increased in each round) and
while there were still some 51 reclaimable pages they couldn't be
dropped for some reason (vmscan trace points would tell us more about
that part).  available will represent reclaimable + free_pages scaled
down per no_progress_loops factor.  This is essentially an optimistic
estimate of how much memory we would have when reclaiming everything.
This can be compared to min_wmark to get a rought idea but the
wmark_check tells the result of the watermark check which is more
precise (includes lowmem reserves, considers the order etc.).  As we can
see no zone is eligible in the end and that is why we have triggered the
oom in this situation.

Please note that higher order requests might fail on the wmark_check
even when there is much more memory available than min_wmark - e.g.
when the memory is fragmented.  A follow up tracepoint will help to
debug those situations.

Link: http://lkml.kernel.org/r/20161220130135.15719-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/oom.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            | 10 ++++++++--
 2 files changed, 50 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 1e974983757e..9160da7a26a0 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -4,6 +4,7 @@
 #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_OOM_H
 #include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
 
 TRACE_EVENT(oom_score_adj_update,
 
@@ -27,6 +28,47 @@ TRACE_EVENT(oom_score_adj_update,
 		__entry->pid, __entry->comm, __entry->oom_score_adj)
 );
 
+TRACE_EVENT(reclaim_retry_zone,
+
+	TP_PROTO(struct zoneref *zoneref,
+		int order,
+		unsigned long reclaimable,
+		unsigned long available,
+		unsigned long min_wmark,
+		int no_progress_loops,
+		bool wmark_check),
+
+	TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check),
+
+	TP_STRUCT__entry(
+		__field(	int, node)
+		__field(	int, zone_idx)
+		__field(	int,	order)
+		__field(	unsigned long,	reclaimable)
+		__field(	unsigned long,	available)
+		__field(	unsigned long,	min_wmark)
+		__field(	int,	no_progress_loops)
+		__field(	bool,	wmark_check)
+	),
+
+	TP_fast_assign(
+		__entry->node = zone_to_nid(zoneref->zone);
+		__entry->zone_idx = zoneref->zone_idx;
+		__entry->order = order;
+		__entry->reclaimable = reclaimable;
+		__entry->available = available;
+		__entry->min_wmark = min_wmark;
+		__entry->no_progress_loops = no_progress_loops;
+		__entry->wmark_check = wmark_check;
+	),
+
+	TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d",
+			__entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE),
+			__entry->order,
+			__entry->reclaimable, __entry->available, __entry->min_wmark,
+			__entry->no_progress_loops,
+			__entry->wmark_check)
+);
 #endif
 
 /* This part must be outside protection */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index af65c4eedc79..d20f8c3139bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,6 +55,7 @@
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
+#include <trace/events/oom.h>
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
@@ -3468,6 +3469,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 					ac->nodemask) {
 		unsigned long available;
 		unsigned long reclaimable;
+		unsigned long min_wmark = min_wmark_pages(zone);
+		bool wmark;
 
 		available = reclaimable = zone_reclaimable_pages(zone);
 		available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3478,8 +3481,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		 * Would the allocation succeed if we reclaimed the whole
 		 * available?
 		 */
-		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
-				ac_classzone_idx(ac), alloc_flags, available)) {
+		wmark = __zone_watermark_ok(zone, order, min_wmark,
+				ac_classzone_idx(ac), alloc_flags, available);
+		trace_reclaim_retry_zone(z, order, reclaimable,
+				available, min_wmark, *no_progress_loops, wmark);
+		if (wmark) {
 			/*
 			 * If we didn't make any progress and have a lot of
 			 * dirty + writeback pages then we should wait for
-- 
cgit 


From 65190cff3cc108b72e42cce67ed8b73dbad6b731 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:42:03 -0800
Subject: oom, trace: add compaction retry tracepoint

Higher order requests oom debugging is currently quite hard.  We do have
some compaction points which can tell us how the compaction is operating
but there is no trace point to tell us about compaction retry logic.
This patch adds a one which will have the following format

            bash-3126  [001] ....  1498.220001: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=withdrawn retries=0 max_retries=16 should_retry=0

we can see that the order 9 request is not retried even though we are in
the highest compaction priority mode becase the last compaction attempt
was withdrawn.  This means that compaction_zonelist_suitable must have
returned false and there is no suitable zone to compact for this request
and so no need to retry further.

another example would be
           <...>-3137  [001] ....    81.501689: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=failed retries=0 max_retries=16 should_retry=0

in this case the order-9 compaction failed to find any suitable block.
We do not retry anymore because this is a costly request and those do
not go below COMPACT_PRIO_SYNC_LIGHT priority.

Link: http://lkml.kernel.org/r/20161220130135.15719-4-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/mmflags.h | 26 ++++++++++++++++++++++++++
 include/trace/events/oom.h     | 39 +++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c                | 22 ++++++++++++++++------
 3 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 75ed3220ede2..91554faed17e 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -186,8 +186,32 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
 	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
 	EMe(COMPACT_CONTENDED,		"contended")
+
+/* High-level compaction status feedback */
+#define COMPACTION_FAILED	1
+#define COMPACTION_WITHDRAWN	2
+#define COMPACTION_PROGRESS	3
+
+#define compact_result_to_feedback(result)	\
+({						\
+	enum compact_result __result = result;	\
+	(compaction_failed(__result)) ? COMPACTION_FAILED : \
+		(compaction_withdrawn(__result)) ? COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \
+})
+
+#define COMPACTION_FEEDBACK		\
+	EM(COMPACTION_FAILED,		"failed")	\
+	EM(COMPACTION_WITHDRAWN,	"withdrawn")	\
+	EMe(COMPACTION_PROGRESS,	"progress")
+
+#define COMPACTION_PRIORITY						\
+	EM(COMPACT_PRIO_SYNC_FULL,	"COMPACT_PRIO_SYNC_FULL")	\
+	EM(COMPACT_PRIO_SYNC_LIGHT,	"COMPACT_PRIO_SYNC_LIGHT")	\
+	EMe(COMPACT_PRIO_ASYNC,		"COMPACT_PRIO_ASYNC")
 #else
 #define COMPACTION_STATUS
+#define COMPACTION_PRIORITY
+#define COMPACTION_FEEDBACK
 #endif
 
 #ifdef CONFIG_ZONE_DMA
@@ -225,6 +249,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 #define EMe(a, b)	TRACE_DEFINE_ENUM(a);
 
 COMPACTION_STATUS
+COMPACTION_PRIORITY
+COMPACTION_FEEDBACK
 ZONE_TYPE
 
 /*
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 9160da7a26a0..38baeb27221a 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -69,6 +69,45 @@ TRACE_EVENT(reclaim_retry_zone,
 			__entry->no_progress_loops,
 			__entry->wmark_check)
 );
+
+#ifdef CONFIG_COMPACTION
+TRACE_EVENT(compact_retry,
+
+	TP_PROTO(int order,
+		enum compact_priority priority,
+		enum compact_result result,
+		int retries,
+		int max_retries,
+		bool ret),
+
+	TP_ARGS(order, priority, result, retries, max_retries, ret),
+
+	TP_STRUCT__entry(
+		__field(	int, order)
+		__field(	int, priority)
+		__field(	int, result)
+		__field(	int, retries)
+		__field(	int, max_retries)
+		__field(	bool, ret)
+	),
+
+	TP_fast_assign(
+		__entry->order = order;
+		__entry->priority = priority;
+		__entry->result = compact_result_to_feedback(result);
+		__entry->retries = retries;
+		__entry->max_retries = max_retries;
+		__entry->ret = ret;
+	),
+
+	TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d",
+			__entry->order,
+			__print_symbolic(__entry->priority, COMPACTION_PRIORITY),
+			__print_symbolic(__entry->result, COMPACTION_FEEDBACK),
+			__entry->retries, __entry->max_retries,
+			__entry->ret)
+);
+#endif /* CONFIG_COMPACTION */
 #endif
 
 /* This part must be outside protection */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d20f8c3139bb..05c0a59323bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3197,6 +3197,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 {
 	int max_retries = MAX_COMPACT_RETRIES;
 	int min_priority;
+	bool ret = false;
+	int retries = *compaction_retries;
+	enum compact_priority priority = *compact_priority;
 
 	if (!order)
 		return false;
@@ -3218,8 +3221,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	 * But do not retry if the given zonelist is not suitable for
 	 * compaction.
 	 */
-	if (compaction_withdrawn(compact_result))
-		return compaction_zonelist_suitable(ac, order, alloc_flags);
+	if (compaction_withdrawn(compact_result)) {
+		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+		goto out;
+	}
 
 	/*
 	 * !costly requests are much more important than __GFP_REPEAT
@@ -3231,8 +3236,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	 */
 	if (order > PAGE_ALLOC_COSTLY_ORDER)
 		max_retries /= 4;
-	if (*compaction_retries <= max_retries)
-		return true;
+	if (*compaction_retries <= max_retries) {
+		ret = true;
+		goto out;
+	}
 
 	/*
 	 * Make sure there are attempts at the highest priority if we exhausted
@@ -3241,12 +3248,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 check_priority:
 	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
 			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+
 	if (*compact_priority > min_priority) {
 		(*compact_priority)--;
 		*compaction_retries = 0;
-		return true;
+		ret = true;
 	}
-	return false;
+out:
+	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+	return ret;
 }
 #else
 static inline struct page *
-- 
cgit 


From b92df1de5d289c0b5d653e72414bf0850b8511e0 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Wed, 22 Feb 2017 15:44:53 -0800
Subject: mm: page_alloc: skip over regions of invalid pfns where possible

When using a sparse memory model memmap_init_zone() when invoked with
the MEMMAP_EARLY context will skip over pages which aren't valid - ie.
which aren't in a populated region of the sparse memory map.  However if
the memory map is extremely sparse then it can spend a long time
linearly checking each PFN in a large non-populated region of the memory
map & skipping it in turn.

When CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled, we have sufficient
information to quickly discover the next valid PFN given an invalid one
by searching through the list of memory regions & skipping forwards to
the first PFN covered by the memory region to the right of the
non-populated region.  Implement this in order to speed up
memmap_init_zone() for systems with extremely sparse memory maps.

James said "I have tested this patch on a virtual model of a Samurai CPU
with a sparse memory map.  The kernel boot time drops from 109 to
62 seconds. "

Link: http://lkml.kernel.org/r/20161125185518.29885-1-paul.burton@imgtec.com
Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Tested-by: James Hartley <james.hartley@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  1 +
 mm/memblock.c            | 25 +++++++++++++++++++++++++
 mm/page_alloc.c          | 11 ++++++++++-
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5b759c9acf97..38bcf00cbed3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
+unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index 7608bc305936..a476d28e0733 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
+						      unsigned long max_pfn)
+{
+	struct memblock_type *type = &memblock.memory;
+	unsigned int right = type->cnt;
+	unsigned int mid, left = 0;
+	phys_addr_t addr = PFN_PHYS(pfn + 1);
+
+	do {
+		mid = (right + left) / 2;
+
+		if (addr < type->regions[mid].base)
+			right = mid;
+		else if (addr >= (type->regions[mid].base +
+				  type->regions[mid].size))
+			left = mid + 1;
+		else {
+			/* addr is within the region, so pfn + 1 is valid */
+			return min(pfn + 1, max_pfn);
+		}
+	} while (left < right);
+
+	return min(PHYS_PFN(type->regions[right].base), max_pfn);
+}
+
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05c0a59323bd..6da3169d3750 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5103,8 +5103,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn))
+		if (!early_pfn_valid(pfn)) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+			/*
+			 * Skip to the pfn preceding the next valid one (or
+			 * end_pfn), such that we hit a valid pfn (or end_pfn)
+			 * on our next iteration of the loop.
+			 */
+			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
+#endif
 			continue;
+		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-- 
cgit 


From c02e50bb8a55a7adeeca5e411479ed70c6a2dfa1 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:07 -0800
Subject: mm, page_alloc: do not report all nodes in show_mem

Patch series "show_mem updates", v2.

This is a mixture of one bug fix (patch 1), an enhancement (patch 2) and
cleanups (the rest of the series).  First two patches should be really
straightforward.  Patch 3 removes some arch specific show_mem
implementations because I think they are quite outdated and do not
really serve any useful purpose anymore.  I think we should really
strive to have a consistent show_mem output regardless of the
architecture.  If some architecture is really special and wants to dump
something additional we should do that via an arch specific hook.

The last patch adds nodemask parameter so that we do not rely on the
hardcoded mems_allowed of the current task when doing the node
filtering.  I consider this more a cleanup than a fix because basically
all users use a nodemask which is a subset of mems_allowed.  There is
only one call path in the memory hotplug which doesn't comply with this
but that is hardly something to worry about.

This patch (of 4):

Commit 599d0c954f91 ("mm, vmscan: move LRU lists to node") has added per
numa node statistics to show_mem but it forgot to add
skip_free_areas_node to filter out nodes which are outside of the
allocating task numa policy.  Add this check to not pollute the output
with the pointless information.

Link: http://lkml.kernel.org/r/20170117091543.25850-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da3169d3750..cf6b53dc08f9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4368,6 +4368,9 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_online_pgdat(pgdat) {
+		if (skip_free_areas_node(filter, pgdat->node_id))
+			continue;
+
 		printk("Node %d"
 			" active_anon:%lukB"
 			" inactive_anon:%lukB"
-- 
cgit 


From a8e99259e7e32b67af2b447f0a570813c0c283ec Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:10 -0800
Subject: mm, page_alloc: warn_alloc print nodemask

warn_alloc is currently used for to report an allocation failure or an
allocation stall.  We print some details of the allocation request like
the gfp mask and the request order.  We do not print the allocation
nodemask which is important when debugging the reason for the allocation
failure as well.  We alreaddy print the nodemask in the OOM report.

Add nodemask to warn_alloc and print it in warn_alloc as well.

Link: http://lkml.kernel.org/r/20170117091543.25850-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  4 ++--
 mm/page_alloc.c    | 10 ++++++----
 mm/vmalloc.c       |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dae6f58d67c8..28b6c3f8a7f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1942,8 +1942,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern unsigned long arch_reserved_kernel_pages(void);
 #endif
 
-extern __printf(2, 3)
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...);
+extern __printf(3, 4)
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
 
 extern void setup_per_cpu_pageset(void);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf6b53dc08f9..96c8fe602dfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3028,12 +3028,13 @@ static void warn_alloc_show_mem(gfp_t gfp_mask)
 	show_mem(filter);
 }
 
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
+	nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed;
 
 	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
 	    debug_guardpage_minorder() > 0)
@@ -3047,7 +3048,8 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	pr_cont("%pV", &vaf);
 	va_end(args);
 
-	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+	pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm));
+	cpuset_print_current_mems_allowed();
 
 	dump_stack();
 	warn_alloc_show_mem(gfp_mask);
@@ -3724,7 +3726,7 @@ retry:
 
 	/* Make sure we know about allocations which stall for too long */
 	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask,
+		warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation stalls for %ums, order:%u",
 			jiffies_to_msecs(jiffies-alloc_start), order);
 		stall_timeout += 10 * HZ;
@@ -3775,7 +3777,7 @@ nopage:
 	if (read_mems_allowed_retry(cpuset_mems_cookie))
 		goto retry_cpuset;
 
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation failure: order:%u", order);
 got_pg:
 	return page;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5f5b09e9dccd..d89034a393f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	return area->addr;
 
 fail:
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
 			  (area->nr_pages*PAGE_SIZE), area->size);
 	vfree(area->addr);
@@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask,
+	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
 	return NULL;
 }
-- 
cgit 


From 9af744d743170b5f5ef70031dea8d772d166ab28 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:16 -0800
Subject: lib/show_mem.c: teach show_mem to work with the given nodemask

show_mem() allows to filter out node specific data which is irrelevant
to the allocation request via SHOW_MEM_FILTER_NODES.  The filtering is
done in skip_free_areas_node which skips all nodes which are not in the
mems_allowed of the current process.  This works most of the time as
expected because the nodemask shouldn't be outside of the allocating
task but there are some exceptions.  E.g.  memory hotplug might want to
request allocations from outside of the allowed nodes (see
new_node_page).

Get rid of this hardcoded behavior and push the allocation mask down the
show_mem path and use it instead of cpuset_current_mems_allowed.  NULL
nodemask is interpreted as cpuset_current_mems_allowed.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170117091543.25850-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/xmon/xmon.c            |  2 +-
 arch/sparc/kernel/setup_32.c        |  2 +-
 drivers/net/ethernet/sgi/ioc3-eth.c |  2 +-
 drivers/tty/sysrq.c                 |  2 +-
 drivers/tty/vt/keyboard.c           |  2 +-
 include/linux/mm.h                  |  5 ++---
 lib/show_mem.c                      |  4 ++--
 mm/nommu.c                          |  6 +++---
 mm/oom_kill.c                       |  2 +-
 mm/page_alloc.c                     | 38 ++++++++++++++++++-------------------
 10 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 1be0499f5397..5720236d0266 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -916,7 +916,7 @@ cmds(struct pt_regs *excp)
 				memzcan();
 				break;
 			case 'i':
-				show_mem(0);
+				show_mem(0, NULL);
 				break;
 			default:
 				termch = cmd;
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index c4e65cb3280f..6f06058c5ae7 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -82,7 +82,7 @@ static void prom_sync_me(void)
 			     "nop\n\t" : : "r" (&trapbase));
 
 	prom_printf("PROM SYNC COMMAND...\n");
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	if (!is_idle_task(current)) {
 		local_irq_enable();
 		sys_sync();
diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index d390b9663dc3..57e6cef81ebe 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev)
 
 			skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
 			if (!skb) {
-				show_free_areas(0);
+				show_free_areas(0, NULL);
 				continue;
 			}
 
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 701c085bb19b..71136742e606 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = {
 
 static void sysrq_handle_showmem(int key)
 {
-	show_mem(0);
+	show_mem(0, NULL);
 }
 static struct sysrq_key_op sysrq_showmem_op = {
 	.handler	= sysrq_handle_showmem,
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 3dd6a491cdba..397e1509fe51 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc)
 
 static void fn_show_mem(struct vc_data *vc)
 {
-	show_mem(0);
+	show_mem(0, NULL);
 }
 
 static void fn_show_state(struct vc_data *vc)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 28b6c3f8a7f3..8a67cae5a07c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1152,8 +1152,7 @@ extern void pagefault_out_of_memory(void);
  */
 #define SHOW_MEM_FILTER_NODES		(0x0001u)	/* disallowed nodes */
 
-extern void show_free_areas(unsigned int flags);
-extern bool skip_free_areas_node(unsigned int flags, int nid);
+extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
 
 int shmem_zero_setup(struct vm_area_struct *);
 #ifdef CONFIG_SHMEM
@@ -1934,7 +1933,7 @@ extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
 extern void __init mmap_init(void);
-extern void show_mem(unsigned int flags);
+extern void show_mem(unsigned int flags, nodemask_t *nodemask);
 extern long si_mem_available(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1feed6a2b12a..0beaa1d899aa 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -9,13 +9,13 @@
 #include <linux/quicklist.h>
 #include <linux/cma.h>
 
-void show_mem(unsigned int filter)
+void show_mem(unsigned int filter, nodemask_t *nodemask)
 {
 	pg_data_t *pgdat;
 	unsigned long total = 0, reserved = 0, highmem = 0;
 
 	printk("Mem-Info:\n");
-	show_free_areas(filter);
+	show_free_areas(filter, nodemask);
 
 	for_each_online_pgdat(pgdat) {
 		unsigned long flags;
diff --git a/mm/nommu.c b/mm/nommu.c
index 24f9f5f39145..bc964c26be8c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1191,7 +1191,7 @@ error_free:
 enomem:
 	pr_err("Allocation of length %lu from process %d (%s) failed\n",
 	       len, current->pid, current->comm);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 }
 
@@ -1412,13 +1412,13 @@ error_getting_vma:
 	kmem_cache_free(vm_region_jar, region);
 	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
 			len, current->pid);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 
 error_getting_region:
 	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
 			len, current->pid);
-	show_free_areas(0);
+	show_free_areas(0, NULL);
 	return -ENOMEM;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ec9f11d4f094..7176b6a754cf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 	if (oc->memcg)
 		mem_cgroup_print_oom_info(oc->memcg, p);
 	else
-		show_mem(SHOW_MEM_FILTER_NODES);
+		show_mem(SHOW_MEM_FILTER_NODES, nm);
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(oc->memcg, oc->nodemask);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96c8fe602dfb..644fb75f6f24 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3005,7 +3005,7 @@ static inline bool should_suppress_show_mem(void)
 	return ret;
 }
 
-static void warn_alloc_show_mem(gfp_t gfp_mask)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
 	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
@@ -3025,7 +3025,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask)
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 		filter &= ~SHOW_MEM_FILTER_NODES;
 
-	show_mem(filter);
+	show_mem(filter, nodemask);
 }
 
 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -3052,7 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	cpuset_print_current_mems_allowed();
 
 	dump_stack();
-	warn_alloc_show_mem(gfp_mask);
+	warn_alloc_show_mem(gfp_mask, nm);
 }
 
 static inline struct page *
@@ -4274,20 +4274,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  * Determine whether the node should be displayed or not, depending on whether
  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
  */
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
 {
-	bool ret = false;
-	unsigned int cpuset_mems_cookie;
-
 	if (!(flags & SHOW_MEM_FILTER_NODES))
-		goto out;
+		return false;
 
-	do {
-		cpuset_mems_cookie = read_mems_allowed_begin();
-		ret = !node_isset(nid, cpuset_current_mems_allowed);
-	} while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
-	return ret;
+	/*
+	 * no node mask - aka implicit memory numa policy. Do not bother with
+	 * the synchronization - read_mems_allowed_begin - because we do not
+	 * have to be precise here.
+	 */
+	if (!nodemask)
+		nodemask = &cpuset_current_mems_allowed;
+
+	return !node_isset(nid, *nodemask);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4328,7 +4328,7 @@ static void show_migration_types(unsigned char type)
  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
  *   cpuset.
  */
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
 	unsigned long free_pcp = 0;
 	int cpu;
@@ -4336,7 +4336,7 @@ void show_free_areas(unsigned int filter)
 	pg_data_t *pgdat;
 
 	for_each_populated_zone(zone) {
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 
 		for_each_online_cpu(cpu)
@@ -4370,7 +4370,7 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_FREE_CMA_PAGES));
 
 	for_each_online_pgdat(pgdat) {
-		if (skip_free_areas_node(filter, pgdat->node_id))
+		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
 			continue;
 
 		printk("Node %d"
@@ -4422,7 +4422,7 @@ void show_free_areas(unsigned int filter)
 	for_each_populated_zone(zone) {
 		int i;
 
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 
 		free_pcp = 0;
@@ -4487,7 +4487,7 @@ void show_free_areas(unsigned int filter)
 		unsigned long nr[MAX_ORDER], flags, total = 0;
 		unsigned char types[MAX_ORDER];
 
-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
 			continue;
 		show_node(zone);
 		printk(KERN_CONT "%s: ", zone->name);
-- 
cgit 


From 9a67f6488eca926f8356b2737fc9f8f6c0cbed85 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:19 -0800
Subject: mm: consolidate GFP_NOFAIL checks in the allocator slowpath

Tetsuo Handa has pointed out that commit 0a0337e0d1d1 ("mm, oom: rework
oom detection") has subtly changed semantic for costly high order
requests with __GFP_NOFAIL and withtout __GFP_REPEAT and those can fail
right now.  My code inspection didn't reveal any such users in the tree
but it is true that this might lead to unexpected allocation failures
and subsequent OOPs.

__alloc_pages_slowpath wrt.  GFP_NOFAIL is hard to follow currently.
There are few special cases but we are lacking a catch all place to be
sure we will not miss any case where the non failing allocation might
fail.  This patch reorganizes the code a bit and puts all those special
cases under nopage label which is the generic go-to-fail path.  Non
failing allocations are retried or those that cannot retry like
non-sleeping allocation go to the failure point directly.  This should
make the code flow much easier to follow and make it less error prone
for future changes.

While we are there we have to move the stall check up to catch
potentially looping non-failing allocations.

[akpm@linux-foundation.org: fix alloc_flags may-be-used-uninitalized]
Link: http://lkml.kernel.org/r/20161220134904.21023-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 91 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 39 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 644fb75f6f24..dd36da6ffef5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3577,6 +3577,14 @@ retry_cpuset:
 	no_progress_loops = 0;
 	compact_priority = DEF_COMPACT_PRIORITY;
 	cpuset_mems_cookie = read_mems_allowed_begin();
+
+	/*
+	 * The fast path uses conservative alloc_flags to succeed only until
+	 * kswapd needs to be woken up, and to avoid the cost of setting up
+	 * alloc_flags precisely. So we do that now.
+	 */
+	alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
 	/*
 	 * We need to recalculate the starting point for the zonelist iterator
 	 * because we might have used different nodemask in the fast path, or
@@ -3588,14 +3596,6 @@ retry_cpuset:
 	if (!ac->preferred_zoneref->zone)
 		goto nopage;
 
-
-	/*
-	 * The fast path uses conservative alloc_flags to succeed only until
-	 * kswapd needs to be woken up, and to avoid the cost of setting up
-	 * alloc_flags precisely. So we do that now.
-	 */
-	alloc_flags = gfp_to_alloc_flags(gfp_mask);
-
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
 
@@ -3672,35 +3672,21 @@ retry:
 		goto got_pg;
 
 	/* Caller is not willing to reclaim, we can't balance anything */
-	if (!can_direct_reclaim) {
-		/*
-		 * All existing users of the __GFP_NOFAIL are blockable, so warn
-		 * of any new users that actually allow this type of allocation
-		 * to fail.
-		 */
-		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+	if (!can_direct_reclaim)
 		goto nopage;
-	}
 
-	/* Avoid recursion of direct reclaim */
-	if (current->flags & PF_MEMALLOC) {
-		/*
-		 * __GFP_NOFAIL request from this context is rather bizarre
-		 * because we cannot reclaim anything and only can loop waiting
-		 * for somebody to do a work for us.
-		 */
-		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
-			cond_resched();
-			goto retry;
-		}
-		goto nopage;
+	/* Make sure we know about allocations which stall for too long */
+	if (time_after(jiffies, alloc_start + stall_timeout)) {
+		warn_alloc(gfp_mask, ac->nodemask,
+			"page allocation stalls for %ums, order:%u",
+			jiffies_to_msecs(jiffies-alloc_start), order);
+		stall_timeout += 10 * HZ;
 	}
 
-	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+	/* Avoid recursion of direct reclaim */
+	if (current->flags & PF_MEMALLOC)
 		goto nopage;
 
-
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
 							&did_some_progress);
@@ -3724,14 +3710,6 @@ retry:
 	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
 		goto nopage;
 
-	/* Make sure we know about allocations which stall for too long */
-	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask, ac->nodemask,
-			"page allocation stalls for %ums, order:%u",
-			jiffies_to_msecs(jiffies-alloc_start), order);
-		stall_timeout += 10 * HZ;
-	}
-
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
 				 did_some_progress > 0, &no_progress_loops))
 		goto retry;
@@ -3760,6 +3738,10 @@ retry:
 	if (page)
 		goto got_pg;
 
+	/* Avoid allocations with no watermarks from looping endlessly */
+	if (test_thread_flag(TIF_MEMDIE))
+		goto nopage;
+
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
 		no_progress_loops = 0;
@@ -3777,6 +3759,37 @@ nopage:
 	if (read_mems_allowed_retry(cpuset_mems_cookie))
 		goto retry_cpuset;
 
+	/*
+	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+	 * we always retry
+	 */
+	if (gfp_mask & __GFP_NOFAIL) {
+		/*
+		 * All existing users of the __GFP_NOFAIL are blockable, so warn
+		 * of any new users that actually require GFP_NOWAIT
+		 */
+		if (WARN_ON_ONCE(!can_direct_reclaim))
+			goto fail;
+
+		/*
+		 * PF_MEMALLOC request from this context is rather bizarre
+		 * because we cannot reclaim anything and only can loop waiting
+		 * for somebody to do a work for us
+		 */
+		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+		/*
+		 * non failing costly orders are a hard requirement which we
+		 * are not prepared for much so let's warn about these users
+		 * so that we can identify them and convert them to something
+		 * else.
+		 */
+		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+		cond_resched();
+		goto retry;
+	}
+fail:
 	warn_alloc(gfp_mask, ac->nodemask,
 			"page allocation failure: order:%u", order);
 got_pg:
-- 
cgit 


From 06ad276ac18742c6b281698d41b27a290cd42407 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:22 -0800
Subject: mm, oom: do not enforce OOM killer for __GFP_NOFAIL automatically

__alloc_pages_may_oom makes sure to skip the OOM killer depending on the
allocation request.  This includes lowmem requests, costly high order
requests and others.  For a long time __GFP_NOFAIL acted as an override
for all those rules.  This is not documented and it can be quite
surprising as well.  E.g.  GFP_NOFS requests are not invoking the OOM
killer but GFP_NOFS|__GFP_NOFAIL does so if we try to convert some of
the existing open coded loops around allocator to nofail request (and we
have done that in the past) then such a change would have a non trivial
side effect which is far from obvious.  Note that the primary motivation
for skipping the OOM killer is to prevent from pre-mature invocation.

The exception has been added by commit 82553a937f12 ("oom: invoke oom
killer for __GFP_NOFAIL").  The changelog points out that the oom killer
has to be invoked otherwise the request would be looping for ever.  But
this argument is rather weak because the OOM killer doesn't really
guarantee a forward progress for those exceptional cases:

- it will hardly help to form costly order which in turn can result in
  the system panic because of no oom killable task in the end - I believe
  we certainly do not want to put the system down just because there is a
  nasty driver asking for order-9 page with GFP_NOFAIL not realizing all
  the consequences.  It is much better this request would loop for ever
  than the massive system disruption

- lowmem is also highly unlikely to be freed during OOM killer

- GFP_NOFS request could trigger while there is still a lot of memory
  pinned by filesystems.

This patch simply removes the __GFP_NOFAIL special case in order to have a
more clear semantic without surprising side effects.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Nils Holland <nholland@tisys.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c   |  2 +-
 mm/page_alloc.c | 49 ++++++++++++++++++++++++-------------------------
 2 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7176b6a754cf..c7b48b4282d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1013,7 +1013,7 @@ bool out_of_memory(struct oom_control *oc)
 	 * make sure exclude 0 mask - all other users should have at least
 	 * ___GFP_DIRECT_RECLAIM to get here.
 	 */
-	if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
 		return true;
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd36da6ffef5..1e37740837ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3090,32 +3090,31 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	if (!(gfp_mask & __GFP_NOFAIL)) {
-		/* Coredumps can quickly deplete all memory reserves */
-		if (current->flags & PF_DUMPCORE)
-			goto out;
-		/* The OOM killer will not help higher order allocs */
-		if (order > PAGE_ALLOC_COSTLY_ORDER)
-			goto out;
-		/* The OOM killer does not needlessly kill tasks for lowmem */
-		if (ac->high_zoneidx < ZONE_NORMAL)
-			goto out;
-		if (pm_suspended_storage())
-			goto out;
-		/*
-		 * XXX: GFP_NOFS allocations should rather fail than rely on
-		 * other request to make a forward progress.
-		 * We are in an unfortunate situation where out_of_memory cannot
-		 * do much for this context but let's try it to at least get
-		 * access to memory reserved if the current task is killed (see
-		 * out_of_memory). Once filesystems are ready to handle allocation
-		 * failures more gracefully we should just bail out here.
-		 */
+	/* Coredumps can quickly deplete all memory reserves */
+	if (current->flags & PF_DUMPCORE)
+		goto out;
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+	/* The OOM killer does not needlessly kill tasks for lowmem */
+	if (ac->high_zoneidx < ZONE_NORMAL)
+		goto out;
+	if (pm_suspended_storage())
+		goto out;
+	/*
+	 * XXX: GFP_NOFS allocations should rather fail than rely on
+	 * other request to make a forward progress.
+	 * We are in an unfortunate situation where out_of_memory cannot
+	 * do much for this context but let's try it to at least get
+	 * access to memory reserved if the current task is killed (see
+	 * out_of_memory). Once filesystems are ready to handle allocation
+	 * failures more gracefully we should just bail out here.
+	 */
+
+	/* The OOM killer may not free memory on a specific node */
+	if (gfp_mask & __GFP_THISNODE)
+		goto out;
 
-		/* The OOM killer may not free memory on a specific node */
-		if (gfp_mask & __GFP_THISNODE)
-			goto out;
-	}
 	/* Exhausted what can be done so it's blamo time */
 	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;
-- 
cgit 


From 6c18ba7a18997dadbf7ee912e15677ad2c9993e5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Feb 2017 15:46:25 -0800
Subject: mm: help __GFP_NOFAIL allocations which do not trigger OOM killer

Now that __GFP_NOFAIL doesn't override decisions to skip the oom killer
we are left with requests which require to loop inside the allocator
without invoking the oom killer (e.g.  GFP_NOFS|__GFP_NOFAIL used by fs
code) and so they might, in very unlikely situations, loop for ever -
e.g.  other parallel request could starve them.

This patch tries to limit the likelihood of such a lockup by giving
these __GFP_NOFAIL requests a chance to move on by consuming a small
part of memory reserves.  We are using ALLOC_HARDER which should be
enough to prevent from the starvation by regular allocation requests,
yet it shouldn't consume enough from the reserves to disrupt high
priority requests (ALLOC_HIGH).

While we are at it, let's introduce a helper __alloc_pages_cpuset_fallback
which enforces the cpusets but allows to fallback to ignore them if the
first attempt fails.  __GFP_NOFAIL requests can be considered important
enough to allow cpuset runaway in order for the system to move on.  It
is highly unlikely that any of these will be GFP_USER anyway.

Link: http://lkml.kernel.org/r/20161220134904.21023-4-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1e37740837ac..a179607de26f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3055,6 +3055,26 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	warn_alloc_show_mem(gfp_mask, nm);
 }
 
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+			      unsigned int alloc_flags,
+			      const struct alloc_context *ac)
+{
+	struct page *page;
+
+	page = get_page_from_freelist(gfp_mask, order,
+			alloc_flags|ALLOC_CPUSET, ac);
+	/*
+	 * fallback to ignore cpuset restriction if our nodes
+	 * are depleted
+	 */
+	if (!page)
+		page = get_page_from_freelist(gfp_mask, order,
+				alloc_flags, ac);
+
+	return page;
+}
+
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -3119,17 +3139,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;
 
-		if (gfp_mask & __GFP_NOFAIL) {
-			page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
-			/*
-			 * fallback to ignore cpuset restriction if our nodes
-			 * are depleted
-			 */
-			if (!page)
-				page = get_page_from_freelist(gfp_mask, order,
+		/*
+		 * Help non-failing allocations by giving them access to memory
+		 * reserves
+		 */
+		if (gfp_mask & __GFP_NOFAIL)
+			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
 					ALLOC_NO_WATERMARKS, ac);
-		}
 	}
 out:
 	mutex_unlock(&oom_lock);
@@ -3785,6 +3801,16 @@ nopage:
 		 */
 		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
 
+		/*
+		 * Help non-failing allocations by giving them access to memory
+		 * reserves but do not use ALLOC_NO_WATERMARKS because this
+		 * could deplete whole memory reserves which would just make
+		 * the situation worse
+		 */
+		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+		if (page)
+			goto got_pg;
+
 		cond_resched();
 		goto retry;
 	}
-- 
cgit 


From 685dbf6f5a643c4bdb9323ee3544ec652505d2ea Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Feb 2017 15:46:28 -0800
Subject: mm, page_alloc: warn_alloc nodemask is NULL when cpusets are disabled

The patch "mm, page_alloc: warn_alloc print nodemask" implicitly sets
the allocation nodemask to cpuset_current_mems_allowed when there is no
effective mempolicy.  cpuset_current_mems_allowed is only effective when
cpusets are enabled, which is also printed by warn_alloc(), so setting
the nodemask to cpuset_current_mems_allowed is redundant and prevents
debugging issues where ac->nodemask is not set properly in the page
allocator.

This provides better debugging output since
cpuset_print_current_mems_allowed() is already provided.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701181347320.142399@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a179607de26f..c21b33668133 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3034,7 +3034,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	va_list args;
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
-	nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed;
 
 	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
 	    debug_guardpage_minorder() > 0)
@@ -3048,11 +3047,16 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	pr_cont("%pV", &vaf);
 	va_end(args);
 
-	pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm));
+	pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
+	if (nodemask)
+		pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
+	else
+		pr_cont("(null)\n");
+
 	cpuset_print_current_mems_allowed();
 
 	dump_stack();
-	warn_alloc_show_mem(gfp_mask, nm);
+	warn_alloc_show_mem(gfp_mask, nodemask);
 }
 
 static inline struct page *
-- 
cgit 


From 066b23935578d3913c2df9bed7addbcdf4711f1a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Feb 2017 14:56:26 -0800
Subject: mm, page_alloc: split buffered_rmqueue()

Patch series "Use per-cpu allocator for !irq requests and prepare for a
bulk allocator", v5.

This series is motivated by a conversation led by Jesper Dangaard Brouer
at the last LSF/MM proposing a generic page pool for DMA-coherent pages.
Part of his motivation was due to the overhead of allocating multiple
order-0 that led some drivers to use high-order allocations and
splitting them.  This is very slow in some cases.

The first two patches in this series restructure the page allocator such
that it is relatively easy to introduce an order-0 bulk page allocator.
A patch exists to do that and has been handed over to Jesper until an
in-kernel users is created.  The third patch prevents the per-cpu
allocator being drained from IPI context as that can potentially corrupt
the list after patch four is merged.  The final patch alters the per-cpu
alloctor to make it exclusive to !irq requests.  This cuts
allocation/free overhead by roughly 30%.

Performance tests from both Jesper and me are included in the patch.

This patch (of 4):

buffered_rmqueue removes a page from a given zone and uses the per-cpu
list for order-0.  This is fine but a hypothetical caller that wanted
multiple order-0 pages has to disable/reenable interrupts multiple
times.  This patch structures buffere_rmqueue such that it's relatively
easy to build a bulk order-0 page allocator.  There is no functional
change.

[mgorman@techsingularity.net: failed per-cpu refill may blow up]
  Link: http://lkml.kernel.org/r/20170124112723.mshmgwq2ihxku2um@techsingularity.net
Link: http://lkml.kernel.org/r/20170123153906.3122-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 128 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 79 insertions(+), 49 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c21b33668133..284153d3e0fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2600,74 +2600,104 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #endif
 }
 
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+			bool cold, struct per_cpu_pages *pcp,
+			struct list_head *list)
+{
+	struct page *page;
+
+	do {
+		if (list_empty(list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, list,
+					migratetype, cold);
+			if (unlikely(list_empty(list)))
+				return NULL;
+		}
+
+		if (cold)
+			page = list_last_entry(list, struct page, lru);
+		else
+			page = list_first_entry(list, struct page, lru);
+
+		list_del(&page->lru);
+		pcp->count--;
+	} while (check_new_pcp(page));
+
+	return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+			struct zone *zone, unsigned int order,
+			gfp_t gfp_flags, int migratetype)
+{
+	struct per_cpu_pages *pcp;
+	struct list_head *list;
+	bool cold = ((gfp_flags & __GFP_COLD) != 0);
+	struct page *page;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	list = &pcp->lists[migratetype];
+	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
+	if (page) {
+		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+		zone_statistics(preferred_zone, zone);
+	}
+	local_irq_restore(flags);
+	return page;
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
 static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
 			gfp_t gfp_flags, unsigned int alloc_flags,
 			int migratetype)
 {
 	unsigned long flags;
 	struct page *page;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
-		struct list_head *list;
-
-		local_irq_save(flags);
-		do {
-			pcp = &this_cpu_ptr(zone->pageset)->pcp;
-			list = &pcp->lists[migratetype];
-			if (list_empty(list)) {
-				pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, list,
-						migratetype, cold);
-				if (unlikely(list_empty(list)))
-					goto failed;
-			}
-
-			if (cold)
-				page = list_last_entry(list, struct page, lru);
-			else
-				page = list_first_entry(list, struct page, lru);
-
-			list_del(&page->lru);
-			pcp->count--;
+		page = rmqueue_pcplist(preferred_zone, zone, order,
+				gfp_flags, migratetype);
+		goto out;
+	}
 
-		} while (check_new_pcp(page));
-	} else {
-		/*
-		 * We most definitely don't want callers attempting to
-		 * allocate greater than order-1 page units with __GFP_NOFAIL.
-		 */
-		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-		spin_lock_irqsave(&zone->lock, flags);
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+	spin_lock_irqsave(&zone->lock, flags);
 
-		do {
-			page = NULL;
-			if (alloc_flags & ALLOC_HARDER) {
-				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-				if (page)
-					trace_mm_page_alloc_zone_locked(page, order, migratetype);
-			}
-			if (!page)
-				page = __rmqueue(zone, order, migratetype);
-		} while (page && check_new_pages(page, order));
-		spin_unlock(&zone->lock);
+	do {
+		page = NULL;
+		if (alloc_flags & ALLOC_HARDER) {
+			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			if (page)
+				trace_mm_page_alloc_zone_locked(page, order, migratetype);
+		}
 		if (!page)
-			goto failed;
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-	}
+			page = __rmqueue(zone, order, migratetype);
+	} while (page && check_new_pages(page, order));
+	spin_unlock(&zone->lock);
+	if (!page)
+		goto failed;
+	__mod_zone_freepage_state(zone, -(1 << order),
+				  get_pcppage_migratetype(page));
 
 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 
-	VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 
 failed:
@@ -2972,7 +3002,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		}
 
 try_this_zone:
-		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
 				gfp_mask, alloc_flags, ac->migratetype);
 		if (page) {
 			prep_new_page(page, order, gfp_mask, alloc_flags);
-- 
cgit 


From 9cd7555875bb09dad875e89a76f41f576e11c638 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Feb 2017 14:56:29 -0800
Subject: mm, page_alloc: split alloc_pages_nodemask()

alloc_pages_nodemask does a number of preperation steps that determine
what zones can be used for the allocation depending on a variety of
factors.  This is fine but a hypothetical caller that wanted multiple
order-0 pages has to do the preparation steps multiple times.  This
patch structures __alloc_pages_nodemask such that it's relatively easy
to build a bulk order-0 page allocator.  There is no functional change.

Link: http://lkml.kernel.org/r/20170123153906.3122-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 75 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 284153d3e0fc..678b2882faaa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3855,60 +3855,77 @@ got_pg:
 	return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask,
+		struct alloc_context *ac, gfp_t *alloc_mask,
+		unsigned int *alloc_flags)
 {
-	struct page *page;
-	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
-	struct alloc_context ac = {
-		.high_zoneidx = gfp_zone(gfp_mask),
-		.zonelist = zonelist,
-		.nodemask = nodemask,
-		.migratetype = gfpflags_to_migratetype(gfp_mask),
-	};
+	ac->high_zoneidx = gfp_zone(gfp_mask);
+	ac->zonelist = zonelist;
+	ac->nodemask = nodemask;
+	ac->migratetype = gfpflags_to_migratetype(gfp_mask);
 
 	if (cpusets_enabled()) {
-		alloc_mask |= __GFP_HARDWALL;
-		alloc_flags |= ALLOC_CPUSET;
-		if (!ac.nodemask)
-			ac.nodemask = &cpuset_current_mems_allowed;
+		*alloc_mask |= __GFP_HARDWALL;
+		*alloc_flags |= ALLOC_CPUSET;
+		if (!ac->nodemask)
+			ac->nodemask = &cpuset_current_mems_allowed;
 	}
 
-	gfp_mask &= gfp_allowed_mask;
-
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
 	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+		return false;
 
 	/*
 	 * Check the zones suitable for the gfp_mask contain at least one
 	 * valid zone. It's possible to have an empty zonelist as a result
 	 * of __GFP_THISNODE and a memoryless node
 	 */
-	if (unlikely(!zonelist->_zonerefs->zone))
-		return NULL;
+	if (unlikely(!ac->zonelist->_zonerefs->zone))
+		return false;
 
-	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
+	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+		*alloc_flags |= ALLOC_CMA;
+
+	return true;
+}
 
+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+		unsigned int order, struct alloc_context *ac)
+{
 	/* Dirty zone balancing only done in the fast path */
-	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/*
 	 * The preferred zone is used for statistics but crucially it is
 	 * also used as the starting point for the zonelist iterator. It
 	 * may get reset for allocations that ignore memory policies.
 	 */
-	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-					ac.high_zoneidx, ac.nodemask);
+	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+					ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	struct page *page;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW;
+	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+	struct alloc_context ac = { };
+
+	gfp_mask &= gfp_allowed_mask;
+	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+		return NULL;
+
+	finalise_ac(gfp_mask, order, &ac);
 	if (!ac.preferred_zoneref->zone) {
 		page = NULL;
 		/*
-- 
cgit 


From 0ccce3b924212e121503619df97cc0f17189b77b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Feb 2017 14:56:32 -0800
Subject: mm, page_alloc: drain per-cpu pages from workqueue context

The per-cpu page allocator can be drained immediately via
drain_all_pages() which sends IPIs to every CPU.  In the next patch, the
per-cpu allocator will only be used for interrupt-safe allocations which
prevents draining it from IPI context.  This patch uses workqueues to
drain the per-cpu lists instead.

This is slower but no slowdown during intensive reclaim was measured and
the paths that use drain_all_pages() are not that sensitive to
performance.  This is particularly true as the path would only be
triggered when reclaim is failing.  It also makes a some sense to avoid
storming a machine with IPIs when it's under memory pressure.  Arguably,
it should be further adjusted so that only one caller at a time is
draining pages but it's beyond the scope of the current patch.

Link: http://lkml.kernel.org/r/20170123153906.3122-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 678b2882faaa..610a3db680ae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2339,19 +2339,21 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
 
+static void drain_local_pages_wq(struct work_struct *work)
+{
+	drain_local_pages(NULL);
+}
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
  *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
+	struct work_struct __percpu *works;
 	int cpu;
 
 	/*
@@ -2360,6 +2362,17 @@ void drain_all_pages(struct zone *zone)
 	 */
 	static cpumask_t cpus_with_pcps;
 
+	/* Workqueues cannot recurse */
+	if (current->flags & PF_WQ_WORKER)
+		return;
+
+	/*
+	 * As this can be called from reclaim context, do not reenter reclaim.
+	 * An allocation failure can be handled, it's simply slower
+	 */
+	get_online_cpus();
+	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
+
 	/*
 	 * We don't care about racing with CPU hotplug event
 	 * as offline notification will cause the notified
@@ -2390,8 +2403,25 @@ void drain_all_pages(struct zone *zone)
 		else
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
-	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-								zone, 1);
+
+	if (works) {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct *work = per_cpu_ptr(works, cpu);
+			INIT_WORK(work, drain_local_pages_wq);
+			schedule_work_on(cpu, work);
+		}
+		for_each_cpu(cpu, &cpus_with_pcps)
+			flush_work(per_cpu_ptr(works, cpu));
+	} else {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct work;
+
+			INIT_WORK(&work, drain_local_pages_wq);
+			schedule_work_on(cpu, &work);
+			flush_work(&work);
+		}
+	}
+	put_online_cpus();
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
cgit 


From a459eeb7b852bcdac605123a500c61286c2a2c3d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 24 Feb 2017 14:56:35 -0800
Subject: mm, page_alloc: do not depend on cpu hotplug locks inside the
 allocator

Dmitry has reported the following lockdep splat
  lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3753
  __mutex_lock_common kernel/locking/mutex.c:521 [inline]
  mutex_lock_nested+0x24e/0xff0 kernel/locking/mutex.c:621
  pcpu_alloc+0xbda/0x1280 mm/percpu.c:896
  __alloc_percpu+0x24/0x30 mm/percpu.c:1075
  smpcfd_prepare_cpu+0x73/0xd0 kernel/smp.c:44
  cpuhp_invoke_callback+0x254/0x1480 kernel/cpu.c:136
  cpuhp_up_callbacks+0x81/0x2a0 kernel/cpu.c:493
  _cpu_up+0x1e3/0x2a0 kernel/cpu.c:1057
  do_cpu_up+0x73/0xa0 kernel/cpu.c:1087
  cpu_up+0x18/0x20 kernel/cpu.c:1095
  smp_init+0xe9/0xee kernel/smp.c:564
  kernel_init_freeable+0x439/0x690 init/main.c:1010
  kernel_init+0x13/0x180 init/main.c:941
  ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433

cpu_hotplug_begin
  cpu_hotplug.lock
pcpu_alloc
  pcpu_alloc_mutex

  get_online_cpus+0x62/0x90 kernel/cpu.c:248
  drain_all_pages+0xf8/0x710 mm/page_alloc.c:2385
  __alloc_pages_direct_reclaim mm/page_alloc.c:3440 [inline]
  __alloc_pages_slowpath+0x8fd/0x2370 mm/page_alloc.c:3778
  __alloc_pages_nodemask+0x8f5/0xc60 mm/page_alloc.c:3980
  __alloc_pages include/linux/gfp.h:426 [inline]
  __alloc_pages_node include/linux/gfp.h:439 [inline]
  alloc_pages_node include/linux/gfp.h:453 [inline]
  pcpu_alloc_pages mm/percpu-vm.c:93 [inline]
  pcpu_populate_chunk+0x1e1/0x900 mm/percpu-vm.c:282
  pcpu_alloc+0xe01/0x1280 mm/percpu.c:998
  __alloc_percpu_gfp+0x27/0x30 mm/percpu.c:1062
  bpf_array_alloc_percpu kernel/bpf/arraymap.c:34 [inline]
  array_map_alloc+0x532/0x710 kernel/bpf/arraymap.c:99
  find_and_alloc_map kernel/bpf/syscall.c:34 [inline]
  map_create kernel/bpf/syscall.c:188 [inline]
  SYSC_bpf kernel/bpf/syscall.c:870 [inline]
  SyS_bpf+0xd64/0x2500 kernel/bpf/syscall.c:827
  entry_SYSCALL_64_fastpath+0x1f/0xc2

pcpu_alloc
  pcpu_alloc_mutex
drain_all_pages
  get_online_cpus
    cpu_hotplug.lock

  cpu_hotplug_begin+0x206/0x2e0 kernel/cpu.c:304
  _cpu_up+0xca/0x2a0 kernel/cpu.c:1011
  do_cpu_up+0x73/0xa0 kernel/cpu.c:1087
  cpu_up+0x18/0x20 kernel/cpu.c:1095
  smp_init+0xe9/0xee kernel/smp.c:564
  kernel_init_freeable+0x439/0x690 init/main.c:1010
  kernel_init+0x13/0x180 init/main.c:941
  ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433

cpu_hotplug_begin
  cpu_hotplug.lock

Pulling cpu hotplug locks inside the page allocator is just too
dangerous.  Let's remove the dependency by dropping get_online_cpus()
from drain_all_pages.  This is not so simple though because now we do
not have a protection against cpu hotplug which means 2 things:

  - the work item might be executed on a different cpu in worker from
    unbound pool so it doesn't run on pinned on the cpu

  - we have to make sure that we do not race with page_alloc_cpu_dead
    calling drain_pages_zone

Disabling preemption in drain_local_pages_wq will solve the first
problem drain_local_pages will determine its local CPU from the WQ
context which will be stable after that point, page_alloc_cpu_dead is
pinned to the CPU already.  The later condition is achieved by disabling
IRQs in drain_pages_zone.

Fixes: mm, page_alloc: drain per-cpu pages from workqueue context
Link: http://lkml.kernel.org/r/20170207201950.20482-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 610a3db680ae..8af0d4fa683d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2341,7 +2341,16 @@ void drain_local_pages(struct zone *zone)
 
 static void drain_local_pages_wq(struct work_struct *work)
 {
+	/*
+	 * drain_all_pages doesn't use proper cpu hotplug protection so
+	 * we can race with cpu offline when the WQ can move this from
+	 * a cpu pinned worker to an unbound one. We can operate on a different
+	 * cpu which is allright but we also have to make sure to not move to
+	 * a different one.
+	 */
+	preempt_disable();
 	drain_local_pages(NULL);
+	preempt_enable();
 }
 
 /*
@@ -2366,11 +2375,6 @@ void drain_all_pages(struct zone *zone)
 	if (current->flags & PF_WQ_WORKER)
 		return;
 
-	/*
-	 * As this can be called from reclaim context, do not reenter reclaim.
-	 * An allocation failure can be handled, it's simply slower
-	 */
-	get_online_cpus();
 	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
 
 	/*
@@ -2421,7 +2425,6 @@ void drain_all_pages(struct zone *zone)
 			flush_work(&work);
 		}
 	}
-	put_online_cpus();
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
cgit 


From 374ad05ab64d696303cec5cc8ec3a65d457b7b1c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Feb 2017 14:56:38 -0800
Subject: mm, page_alloc: only use per-cpu allocator for irq-safe requests

Many workloads that allocate pages are not handling an interrupt at a
time.  As allocation requests may be from IRQ context, it's necessary to
disable/enable IRQs for every page allocation.  This cost is the bulk of
the free path but also a significant percentage of the allocation path.

This patch alters the locking and checks such that only irq-safe
allocation requests use the per-cpu allocator.  All others acquire the
irq-safe zone->lock and allocate from the buddy allocator.  It relies on
disabling preemption to safely access the per-cpu structures.  It could
be slightly modified to avoid soft IRQs using it but it's not clear it's
worthwhile.

This modification may slow allocations from IRQ context slightly but the
main gain from the per-cpu allocator is that it scales better for
allocations from multiple contexts.  There is an implicit assumption
that intensive allocations from IRQ contexts on multiple CPUs from a
single NUMA node are rare and that the fast majority of scaling issues
are encountered in !IRQ contexts such as page faulting.  It's worth
noting that this patch is not required for a bulk page allocator but it
significantly reduces the overhead.

The following is results from a page allocator micro-benchmark.  Only
order-0 is interesting as higher orders do not use the per-cpu allocator

                                          4.10.0-rc2                 4.10.0-rc2
                                             vanilla               irqsafe-v1r5
Amean    alloc-odr0-1               287.15 (  0.00%)           219.00 ( 23.73%)
Amean    alloc-odr0-2               221.23 (  0.00%)           183.23 ( 17.18%)
Amean    alloc-odr0-4               187.00 (  0.00%)           151.38 ( 19.05%)
Amean    alloc-odr0-8               167.54 (  0.00%)           132.77 ( 20.75%)
Amean    alloc-odr0-16              156.00 (  0.00%)           123.00 ( 21.15%)
Amean    alloc-odr0-32              149.00 (  0.00%)           118.31 ( 20.60%)
Amean    alloc-odr0-64              138.77 (  0.00%)           116.00 ( 16.41%)
Amean    alloc-odr0-128             145.00 (  0.00%)           118.00 ( 18.62%)
Amean    alloc-odr0-256             136.15 (  0.00%)           125.00 (  8.19%)
Amean    alloc-odr0-512             147.92 (  0.00%)           121.77 ( 17.68%)
Amean    alloc-odr0-1024            147.23 (  0.00%)           126.15 ( 14.32%)
Amean    alloc-odr0-2048            155.15 (  0.00%)           129.92 ( 16.26%)
Amean    alloc-odr0-4096            164.00 (  0.00%)           136.77 ( 16.60%)
Amean    alloc-odr0-8192            166.92 (  0.00%)           138.08 ( 17.28%)
Amean    alloc-odr0-16384           159.00 (  0.00%)           138.00 ( 13.21%)
Amean    free-odr0-1                165.00 (  0.00%)            89.00 ( 46.06%)
Amean    free-odr0-2                113.00 (  0.00%)            63.00 ( 44.25%)
Amean    free-odr0-4                 99.00 (  0.00%)            54.00 ( 45.45%)
Amean    free-odr0-8                 88.00 (  0.00%)            47.38 ( 46.15%)
Amean    free-odr0-16                83.00 (  0.00%)            46.00 ( 44.58%)
Amean    free-odr0-32                80.00 (  0.00%)            44.38 ( 44.52%)
Amean    free-odr0-64                72.62 (  0.00%)            43.00 ( 40.78%)
Amean    free-odr0-128               78.00 (  0.00%)            42.00 ( 46.15%)
Amean    free-odr0-256               80.46 (  0.00%)            57.00 ( 29.16%)
Amean    free-odr0-512               96.38 (  0.00%)            64.69 ( 32.88%)
Amean    free-odr0-1024             107.31 (  0.00%)            72.54 ( 32.40%)
Amean    free-odr0-2048             108.92 (  0.00%)            78.08 ( 28.32%)
Amean    free-odr0-4096             113.38 (  0.00%)            82.23 ( 27.48%)
Amean    free-odr0-8192             112.08 (  0.00%)            82.85 ( 26.08%)
Amean    free-odr0-16384            110.38 (  0.00%)            81.92 ( 25.78%)
Amean    total-odr0-1               452.15 (  0.00%)           308.00 ( 31.88%)
Amean    total-odr0-2               334.23 (  0.00%)           246.23 ( 26.33%)
Amean    total-odr0-4               286.00 (  0.00%)           205.38 ( 28.19%)
Amean    total-odr0-8               255.54 (  0.00%)           180.15 ( 29.50%)
Amean    total-odr0-16              239.00 (  0.00%)           169.00 ( 29.29%)
Amean    total-odr0-32              229.00 (  0.00%)           162.69 ( 28.96%)
Amean    total-odr0-64              211.38 (  0.00%)           159.00 ( 24.78%)
Amean    total-odr0-128             223.00 (  0.00%)           160.00 ( 28.25%)
Amean    total-odr0-256             216.62 (  0.00%)           182.00 ( 15.98%)
Amean    total-odr0-512             244.31 (  0.00%)           186.46 ( 23.68%)
Amean    total-odr0-1024            254.54 (  0.00%)           198.69 ( 21.94%)
Amean    total-odr0-2048            264.08 (  0.00%)           208.00 ( 21.24%)
Amean    total-odr0-4096            277.38 (  0.00%)           219.00 ( 21.05%)
Amean    total-odr0-8192            279.00 (  0.00%)           220.92 ( 20.82%)
Amean    total-odr0-16384           269.38 (  0.00%)           219.92 ( 18.36%)

This is the alloc, free and total overhead of allocating order-0 pages
in batches of 1 page up to 16384 pages.  Avoiding disabling/enabling
overhead massively reduces overhead.  Alloc overhead is roughly reduced
by 14-20% in most cases.  The free path is reduced by 26-46% and the
total reduction is significant.

Many users require zeroing of pages from the page allocator which is the
vast cost of allocation.  Hence, the impact on a basic page faulting
benchmark is not that significant

                              4.10.0-rc2            4.10.0-rc2
                                 vanilla          irqsafe-v1r5
Hmean    page_test   656632.98 (  0.00%)   675536.13 (  2.88%)
Hmean    brk_test   3845502.67 (  0.00%)  3867186.94 (  0.56%)
Stddev   page_test    10543.29 (  0.00%)     4104.07 ( 61.07%)
Stddev   brk_test     33472.36 (  0.00%)    15538.39 ( 53.58%)
CoeffVar page_test        1.61 (  0.00%)        0.61 ( 62.15%)
CoeffVar brk_test         0.87 (  0.00%)        0.40 ( 53.84%)
Max      page_test   666513.33 (  0.00%)   678640.00 (  1.82%)
Max      brk_test   3882800.00 (  0.00%)  3887008.66 (  0.11%)

This is from aim9 and the most notable outcome is that fault variability
is reduced by the patch.  The headline improvement is small as the
overall fault cost, zeroing, page table insertion etc dominate relative
to disabling/enabling IRQs in the per-cpu allocator.

Similarly, little benefit was seen on networking benchmarks both
localhost and between physical server/clients where other costs
dominate.  It's possible that this will only be noticable on very high
speed networks.

Jesper Dangaard Brouer independently tested this with a separate
microbenchmark from
  https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Micro-benchmarked with [1] page_bench02:
 modprobe page_bench02 page_order=0 run_flags=$((2#010)) loops=$((10**8)); \
  rmmod page_bench02 ; dmesg --notime | tail -n 4

Compared to baseline: 213 cycles(tsc) 53.417 ns
 - against this     : 184 cycles(tsc) 46.056 ns
 - Saving           : -29 cycles
 - Very close to expected 27 cycles saving [see below [2]]

Micro benchmarking via time_bench_sample[3], we get the cost of these
operations:

 time_bench: Type:for_loop                 Per elem: 0 cycles(tsc) 0.232 ns (step:0)
 time_bench: Type:spin_lock_unlock         Per elem: 33 cycles(tsc) 8.334 ns (step:0)
 time_bench: Type:spin_lock_unlock_irqsave Per elem: 62 cycles(tsc) 15.607 ns (step:0)
 time_bench: Type:irqsave_before_lock      Per elem: 57 cycles(tsc) 14.344 ns (step:0)
 time_bench: Type:spin_lock_unlock_irq     Per elem: 34 cycles(tsc) 8.560 ns (step:0)
 time_bench: Type:simple_irq_disable_before_lock Per elem: 37 cycles(tsc) 9.289 ns (step:0)
 time_bench: Type:local_BH_disable_enable  Per elem: 19 cycles(tsc) 4.920 ns (step:0)
 time_bench: Type:local_IRQ_disable_enable Per elem: 7 cycles(tsc) 1.864 ns (step:0)
 time_bench: Type:local_irq_save_restore   Per elem: 38 cycles(tsc) 9.665 ns (step:0)
 [Mel's patch removes a ^^^^^^^^^^^^^^^^]            ^^^^^^^^^ expected saving - preempt cost
 time_bench: Type:preempt_disable_enable   Per elem: 11 cycles(tsc) 2.794 ns (step:0)
 [adds a preempt  ^^^^^^^^^^^^^^^^^^^^^^]            ^^^^^^^^^ adds this cost
 time_bench: Type:funcion_call_cost        Per elem: 6 cycles(tsc) 1.689 ns (step:0)
 time_bench: Type:func_ptr_call_cost       Per elem: 11 cycles(tsc) 2.767 ns (step:0)
 time_bench: Type:page_alloc_put           Per elem: 211 cycles(tsc) 52.803 ns (step:0)

Thus, expected improvement is: 38-11 = 27 cycles.

[mgorman@techsingularity.net: s/preempt_enable_no_resched/preempt_enable/]
  Link: http://lkml.kernel.org/r/20170208143128.25ahymqlyspjcixu@techsingularity.net
Link: http://lkml.kernel.org/r/20170123153906.3122-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8af0d4fa683d..6196eed96732 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1085,10 +1085,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned;
+	unsigned long nr_scanned, flags;
 	bool isolated_pageblocks;
 
-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
@@ -1137,7 +1137,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1145,8 +1145,9 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned;
-	spin_lock(&zone->lock);
+	unsigned long nr_scanned, flags;
+	spin_lock_irqsave(&zone->lock, flags);
+	__count_vm_events(PGFREE, 1 << order);
 	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
 		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1156,7 +1157,7 @@ static void free_one_page(struct zone *zone,
 		migratetype = get_pfnblock_migratetype(page, pfn);
 	}
 	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1234,7 +1235,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
@@ -1242,10 +1242,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-	local_irq_save(flags);
-	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
-	local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2217,8 +2214,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			int migratetype, bool cold)
 {
 	int i, alloced = 0;
+	unsigned long flags;
 
-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
@@ -2254,7 +2252,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 * pages added to the pcp list.
 	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 	return alloced;
 }
 
@@ -2475,17 +2473,20 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
-	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
+	if (in_interrupt()) {
+		__free_pages_ok(page, 0);
+		return;
+	}
+
 	if (!free_pcp_prepare(page))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	local_irq_save(flags);
-	__count_vm_event(PGFREE);
+	preempt_disable();
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2502,6 +2503,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
+	__count_vm_event(PGFREE);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (!cold)
 		list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2515,7 +2517,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	local_irq_restore(flags);
+	preempt_enable();
 }
 
 /*
@@ -2640,6 +2642,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
+	VM_BUG_ON(in_interrupt());
+
 	do {
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -2670,9 +2674,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	preempt_disable();
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
@@ -2680,7 +2683,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	local_irq_restore(flags);
+	preempt_enable();
 	return page;
 }
 
@@ -2696,7 +2699,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0)) {
+	if (likely(order == 0) && !in_interrupt()) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;
-- 
cgit 


From df76cee6bbeb2ed036f1622f63a99c28cecf6b30 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 24 Feb 2017 14:56:50 -0800
Subject: mm, page_alloc: remove redundant checks from alloc fastpath

The allocation fast path contains two similar checks for zoneref->zone
being NULL, where zoneref points either to the first zone in the
zonelist, or to the preferred zone.  These can be NULL either due to
empty zonelist, or no zone being compatible with given nodemask or
task's cpuset.

These checks are unnecessary, because the zonelist walks in
first_zones_zonelist() and get_page_from_freelist() handle a NULL
starting zoneref->zone or preferred_zoneref->zone safely.  It's safe to
fallback to __alloc_pages_slowpath() where we also have the check early
enough.

Link: http://lkml.kernel.org/r/20170124150511.5710-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6196eed96732..65876feb86f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3915,14 +3915,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	if (should_fail_alloc_page(gfp_mask, order))
 		return false;
 
-	/*
-	 * Check the zones suitable for the gfp_mask contain at least one
-	 * valid zone. It's possible to have an empty zonelist as a result
-	 * of __GFP_THISNODE and a memoryless node
-	 */
-	if (unlikely(!ac->zonelist->_zonerefs->zone))
-		return false;
-
 	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
 		*alloc_flags |= ALLOC_CMA;
 
@@ -3962,22 +3954,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 	finalise_ac(gfp_mask, order, &ac);
-	if (!ac.preferred_zoneref->zone) {
-		page = NULL;
-		/*
-		 * This might be due to race with cpuset_current_mems_allowed
-		 * update, so make sure we retry with original nodemask in the
-		 * slow path.
-		 */
-		goto no_zone;
-	}
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (likely(page))
 		goto out;
 
-no_zone:
 	/*
 	 * Runtime PM, block IO and its error handling path can deadlock
 	 * because I/O on the device might not complete.
-- 
cgit 


From 5104782011a12b04fe9cfaa6f1085bdcdedd79c4 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 24 Feb 2017 14:56:53 -0800
Subject: mm, page_alloc: don't check cpuset allowed twice in fast-path

Since commit 682a3385e773 ("mm, page_alloc: inline the fast path of the
zonelist iterator") we replace a NULL nodemask with
cpuset_current_mems_allowed in the fast path, so that
get_page_from_freelist() filters nodes allowed by the cpuset via
for_next_zone_zonelist_nodemask().

In that case it's pointless to additionaly check __cpuset_zone_allowed()
in each iteration, which we can avoid by not adding ALLOC_CPUSET to
alloc_flags in that scenario.

This saves some cycles in the allocator fast path on systems with one or
more non-root cpuset configured.  In the slow path, ALLOC_CPUSET is
reset according to __alloc_pages_slowpath().  Without configured
cpusets, this code is disabled by a static key.

Link: http://lkml.kernel.org/r/20170124150511.5710-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 65876feb86f3..46c30fa26acd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3903,9 +3903,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	if (cpusets_enabled()) {
 		*alloc_mask |= __GFP_HARDWALL;
-		*alloc_flags |= ALLOC_CPUSET;
 		if (!ac->nodemask)
 			ac->nodemask = &cpuset_current_mems_allowed;
+		else
+			*alloc_flags |= ALLOC_CPUSET;
 	}
 
 	lockdep_trace_alloc(gfp_mask);
-- 
cgit 


From bd233f538d51c2cae6f0bfc2cf7f0960e1683b8a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Feb 2017 14:56:56 -0800
Subject: mm, page_alloc: use static global work_struct for draining per-cpu
 pages

As suggested by Vlastimil Babka and Tejun Heo, this patch uses a static
work_struct to co-ordinate the draining of per-cpu pages on the
workqueue.  Only one task can drain at a time but this is better than
the previous scheme that allowed multiple tasks to send IPIs at a time.

One consideration is whether parallel requests should synchronise
against each other.  This patch does not synchronise for a global drain
as the common case for such callers is expected to be multiple parallel
direct reclaimers competing for pages when the watermark is close to
min.  Draining the per-cpu list is unlikely to make much progress and
serialising the drain is of dubious merit.  Drains are synchonrised for
callers such as memory hotplug and CMA that care about the drain being
complete when the function returns.

Link: http://lkml.kernel.org/r/20170125083038.rzb5f43nptmk7aed@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Tejun Heo <tj@kernel.org>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 46c30fa26acd..41985aa4672d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -92,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
 EXPORT_SYMBOL(latent_entropy);
@@ -2360,7 +2364,6 @@ static void drain_local_pages_wq(struct work_struct *work)
  */
 void drain_all_pages(struct zone *zone)
 {
-	struct work_struct __percpu *works;
 	int cpu;
 
 	/*
@@ -2373,7 +2376,16 @@ void drain_all_pages(struct zone *zone)
 	if (current->flags & PF_WQ_WORKER)
 		return;
 
-	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
+	/*
+	 * Do not drain if one is already in progress unless it's specific to
+	 * a zone. Such callers are primarily CMA and memory hotplug and need
+	 * the drain to be complete when the call returns.
+	 */
+	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+		if (!zone)
+			return;
+		mutex_lock(&pcpu_drain_mutex);
+	}
 
 	/*
 	 * We don't care about racing with CPU hotplug event
@@ -2406,23 +2418,15 @@ void drain_all_pages(struct zone *zone)
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
 
-	if (works) {
-		for_each_cpu(cpu, &cpus_with_pcps) {
-			struct work_struct *work = per_cpu_ptr(works, cpu);
-			INIT_WORK(work, drain_local_pages_wq);
-			schedule_work_on(cpu, work);
-		}
-		for_each_cpu(cpu, &cpus_with_pcps)
-			flush_work(per_cpu_ptr(works, cpu));
-	} else {
-		for_each_cpu(cpu, &cpus_with_pcps) {
-			struct work_struct work;
-
-			INIT_WORK(&work, drain_local_pages_wq);
-			schedule_work_on(cpu, &work);
-			flush_work(&work);
-		}
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+		INIT_WORK(work, drain_local_pages_wq);
+		schedule_work_on(cpu, work);
 	}
+	for_each_cpu(cpu, &cpus_with_pcps)
+		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+	mutex_unlock(&pcpu_drain_mutex);
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
cgit 


From 0efadf48bca01f17cb64ebceaf528590b2bc7665 Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Fri, 24 Feb 2017 14:57:39 -0800
Subject: mm/hotplug: enable memory hotplug for non-lru movable pages

We had considered all of the non-lru pages as unmovable before commit
bda807d44454 ("mm: migrate: support non-lru movable page migration").
But now some of non-lru pages like zsmalloc, virtio-balloon pages also
become movable.  So we can offline such blocks by using non-lru page
migration.

This patch straightforwardly adds non-lru migration code, which means
adding non-lru related code to the functions which scan over pfn and
collect pages to be migrated and isolate them before migration.

Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 28 +++++++++++++++++-----------
 mm/page_alloc.c     |  8 ++++++--
 2 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5c4f48409347..7946375fe466 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1531,10 +1531,10 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 }
 
 /*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
- * and hugepages). We scan pfn because it's much easier than scanning over
- * linked list. This function returns the pfn of the first found movable
- * page if it's found, otherwise 0.
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
+ * non-lru movable pages and hugepages). We scan pfn because it's much
+ * easier than scanning over linked list. This function returns the pfn
+ * of the first found movable page if it's found, otherwise 0.
  */
 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 {
@@ -1545,6 +1545,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 			page = pfn_to_page(pfn);
 			if (PageLRU(page))
 				return pfn;
+			if (__PageMovable(page))
+				return pfn;
 			if (PageHuge(page)) {
 				if (page_huge_active(page))
 					return pfn;
@@ -1621,21 +1623,25 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		if (!get_page_unless_zero(page))
 			continue;
 		/*
-		 * We can skip free pages. And we can only deal with pages on
-		 * LRU.
+		 * We can skip free pages. And we can deal with pages on
+		 * LRU and non-lru movable pages.
 		 */
-		ret = isolate_lru_page(page);
+		if (PageLRU(page))
+			ret = isolate_lru_page(page);
+		else
+			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
 			put_page(page);
 			list_add_tail(&page->lru, &source);
 			move_pages--;
-			inc_node_page_state(page, NR_ISOLATED_ANON +
-					    page_is_file_cache(page));
+			if (!__PageMovable(page))
+				inc_node_page_state(page, NR_ISOLATED_ANON +
+						    page_is_file_cache(page));
 
 		} else {
 #ifdef CONFIG_DEBUG_VM
-			pr_alert("removing pfn %lx from LRU failed\n", pfn);
-			dump_page(page, "failed to remove from LRU");
+			pr_alert("failed to isolate pfn %lx\n", pfn);
+			dump_page(page, "isolation failed");
 #endif
 			put_page(page);
 			/* Because we don't have big zone->lock. we should
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 41985aa4672d..2d34cdb70f1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7227,8 +7227,9 @@ void *__init alloc_large_system_hash(const char *tablename,
  * If @count is not zero, it is okay to include less @count unmovable pages
  *
  * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
  */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 			 bool skip_hwpoisoned_pages)
@@ -7284,6 +7285,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		if (skip_hwpoisoned_pages && PageHWPoison(page))
 			continue;
 
+		if (__PageMovable(page))
+			continue;
+
 		if (!PageLRU(page))
 			found++;
 		/*
-- 
cgit 


From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Fri, 24 Feb 2017 14:58:37 -0800
Subject: mm: alloc_contig_range: allow to specify GFP mask

Currently alloc_contig_range assumes that the compaction should be done
with the default GFP_KERNEL flags.  This is probably right for all
current uses of this interface, but may change as CMA is used in more
use-cases (including being the default DMA memory allocator on some
platforms).

Change the function prototype, to allow for passing through the GFP mask
set by upper layers.

Also respect global restrictions by applying memalloc_noio_flags to the
passed in flags.

Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alexander Graf <agraf@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 +-
 mm/cma.c            | 3 ++-
 mm/hugetlb.c        | 3 ++-
 mm/page_alloc.c     | 5 +++--
 4 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0fe0b6295ab5..db373b9d3223 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void)
 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
-			      unsigned migratetype);
+			      unsigned migratetype, gfp_t gfp_mask);
 extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
 #endif
 
diff --git a/mm/cma.c b/mm/cma.c
index 94b3460cd608..c6aed23ca6df 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -402,7 +402,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 
 		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
 		mutex_lock(&cma_mutex);
-		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
+					 GFP_KERNEL);
 		mutex_unlock(&cma_mutex);
 		if (ret == 0) {
 			page = pfn_to_page(pfn);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 167fd0722c15..2e0e8159ce8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
 				unsigned long nr_pages)
 {
 	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  GFP_KERNEL);
 }
 
 static bool pfn_range_valid_gigantic(struct zone *z,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d34cdb70f1d..8a0f33624335 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7399,6 +7399,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
  *			in range must have the same migratetype and it must
  *			be either of the two.
+ * @gfp_mask:	GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
  * aligned, however it's the caller's responsibility to guarantee that
@@ -7412,7 +7413,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range(unsigned long start, unsigned long end,
-		       unsigned migratetype)
+		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
 	unsigned int order;
@@ -7424,7 +7425,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
-		.gfp_mask = GFP_KERNEL,
+		.gfp_mask = memalloc_noio_flags(gfp_mask),
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
-- 
cgit 


From f2bf14d14dbc50dc56be3ebd18652ea2738ef6f8 Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Fri, 24 Feb 2017 14:58:56 -0800
Subject: mm/page_alloc.c: remove duplicate inclusion of page_ext.h

Link: http://lkml.kernel.org/r/20170202011942.1609-1-standby24x7@gmail.com
Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8a0f33624335..f8ef2c90edbe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -59,7 +59,6 @@
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
-#include <linux/page_ext.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
-- 
cgit 


From e02dc017c3032dcdce1b993af0db135462e1b4b7 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 14:59:33 -0800
Subject: mm/page_alloc: fix nodes for reclaim in fast path

When @node_reclaim_node isn't 0, the page allocator tries to reclaim
pages if the amount of free memory in the zones are below the low
watermark.  On Power platform, none of NUMA nodes are scanned for page
reclaim because no nodes match the condition in zone_allows_reclaim().
On Power platform, RECLAIM_DISTANCE is set to 10 which is the distance
of Node-A to Node-A.  So the preferred node even won't be scanned for
page reclaim.

   __alloc_pages_nodemask()
   get_page_from_freelist()
      zone_allows_reclaim()

Anton proposed the test code as below:

   # cat alloc.c
      :
   int main(int argc, char *argv[])
   {
	void *p;
	unsigned long size;
	unsigned long start, end;

	start = time(NULL);
	size = strtoul(argv[1], NULL, 0);
	printf("To allocate %ldGB memory\n", size);

	size <<= 30;
	p = malloc(size);
	assert(p);
	memset(p, 0, size);

	end = time(NULL);
	printf("Used time: %ld seconds\n", end - start);
	sleep(3600);
	return 0;
   }

The system I use for testing has two NUMA nodes.  Both have 128GB
memory.  In below scnario, the page caches on node#0 should be reclaimed
when it encounters pressure to accommodate request of allocation.

   # echo 2 > /proc/sys/vm/zone_reclaim_mode; \
     sync; \
     echo 3 > /proc/sys/vm/drop_caches; \
   # taskset -c 0 cat file.32G > /dev/null; \
     grep FilePages /sys/devices/system/node/node0/meminfo
     Node 0 FilePages:       33619712 kB
   # taskset -c 0 ./alloc 128
   # grep FilePages /sys/devices/system/node/node0/meminfo
     Node 0 FilePages:       33619840 kB
   # grep MemFree /sys/devices/system/node/node0/meminfo
     Node 0 MemFree:          186816 kB

With the patch applied, the pagecache on node-0 is reclaimed when its
free memory is running out.  It's the expected behaviour.

   # echo 2 > /proc/sys/vm/zone_reclaim_mode; \
     sync; \
     echo 3 > /proc/sys/vm/drop_caches
   # taskset -c 0 cat file.32G > /dev/null; \
     grep FilePages /sys/devices/system/node/node0/meminfo
     Node 0 FilePages:       33605568 kB
   # taskset -c 0 ./alloc 128
   # grep FilePages /sys/devices/system/node/node0/meminfo
     Node 0 FilePages:        1379520 kB
   # grep MemFree /sys/devices/system/node/node0/meminfo
     Node 0 MemFree:           317120 kB

Fixes: 5f7a75acdb24 ("mm: page_alloc: do not cache reclaim distances")
Link: http://lkml.kernel.org/r/1486532455-29613-1-git-send-email-gwshan@linux.vnet.ibm.com
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>	[3.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8ef2c90edbe..4c2011f6b08f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2944,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 #ifdef CONFIG_NUMA
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
 				RECLAIM_DISTANCE;
 }
 #else	/* CONFIG_NUMA */
-- 
cgit 


From ad69444e75d77981291ccf807f48d81e8fca010f Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Fri, 24 Feb 2017 14:59:45 -0800
Subject: mm/page_alloc.c: remove redundant init code for ZONE_MOVABLE

arch_zone_lowest/highest_possible_pfn[] is set to 0 and [ZONE_MOVABLE]
is skipped in the loop.  No need to reset them to 0 again.

This patch just removes the redundant code.

Link: http://lkml.kernel.org/r/20170209141731.60208-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4c2011f6b08f..9f9623d690d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6489,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 		start_pfn = end_pfn;
 	}
-	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
-	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
 
 	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
 	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-- 
cgit 


From 89d790ab31d033d67635f6362d57ea64e47708fa Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 27 Feb 2017 14:29:01 -0800
Subject: scripts/spelling.txt: add "algined" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  algined||aligned

While we are here, fix the "appplication" in the touched line in
drivers/block/loop.c.  Also, fix the "may not naturally ..." to
"may not be naturally ..." in the touched line in mm/page_alloc.

Link: http://lkml.kernel.org/r/1481573103-11329-9-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/loop.c | 2 +-
 mm/page_alloc.c      | 2 +-
 scripts/spelling.txt | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 304377182c1a..4b52a1690329 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -186,7 +186,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 	 *
 	 * TODO: the above condition may be loosed in the future, and
 	 * direct I/O may be switched runtime at that time because most
-	 * of requests in sane appplications should be PAGE_SIZE algined
+	 * of requests in sane applications should be PAGE_SIZE aligned
 	 */
 	if (dio) {
 		if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f9623d690d6..a7a6aac95a6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5925,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
 	 * the zone and SPARSEMEM is in use. If there are holes within the
 	 * zone, each populated memory region may cost us one or two extra
 	 * memmap pages due to alignment because memmap pages for each
-	 * populated regions may not naturally algined on page boundary.
+	 * populated regions may not be naturally aligned on page boundary.
 	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
 	 */
 	if (spanned_pages > present_pages + (present_pages >> 4) &&
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 12777ab32306..096e8b8b6502 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -65,6 +65,7 @@ afecting||affecting
 agaist||against
 albumns||albums
 alegorical||allegorical
+algined||aligned
 algorith||algorithm
 algorithmical||algorithmically
 algoritm||algorithm
-- 
cgit 


From 5b3cc15aff243cb518cbeed8b1a220cbfd023d9c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 2 Feb 2017 20:43:54 +0100
Subject: sched/headers: Prepare to move the memalloc_noio_*() APIs to
 <linux/sched/mm.h>

Update the .c files that depend on these APIs.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/base/power/runtime.c | 2 +-
 drivers/md/dm-bufio.c        | 1 +
 drivers/md/dm-ioctl.c        | 1 +
 drivers/usb/core/hub.c       | 2 +-
 fs/ocfs2/cluster/tcp.c       | 1 +
 fs/xfs/kmem.c                | 1 +
 fs/xfs/xfs_buf.c             | 1 +
 mm/page_alloc.c              | 1 +
 mm/vmscan.c                  | 1 +
 net/ceph/crypto.c            | 1 +
 10 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index a14fac6a01d3..7bcf80fa9ada 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -7,7 +7,7 @@
  * This file is released under the GPLv2.
  */
 
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/export.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm_wakeirq.h>
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d36d427a9efb..df4859f6ac6a 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -11,6 +11,7 @@
 #include <linux/device-mapper.h>
 #include <linux/dm-io.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 #include <linux/jiffies.h>
 #include <linux/vmalloc.h>
 #include <linux/shrinker.h>
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a5a9b17f0f7f..4da6fc6b1ffd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/miscdevice.h>
+#include <linux/sched/mm.h>
 #include <linux/init.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index a56c75e09786..f0dd08198d74 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -15,7 +15,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/completion.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/ioctl.h>
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ec000575e863..4348027384f5 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -54,6 +54,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched/mm.h>
 #include <linux/jiffies.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 339c696bbc01..2dfdc62f795e 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -16,6 +16,7 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8c7d01b75922..b6208728ba39 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/sched/mm.h>
 
 #include "xfs_format.h"
 #include "xfs_log_format.h"
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a7a6aac95a6d..eaa64d2ffdc5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/mm.h>
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 70aa739c6b68..bc8031ef994d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -14,6 +14,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/kernel_stat.h>
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 85747b7f91a9..46008d5ac504 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -8,6 +8,7 @@
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
 #include <linux/key-type.h>
+#include <linux/sched/mm.h>
 
 #include <keys/ceph-type.h>
 #include <keys/user-type.h>
-- 
cgit 


From b4fb8f66f1ae2e167d06c12d018025a8d4d3ba7e Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 8 Mar 2017 09:35:39 -0800
Subject: mm, page_alloc: Add missing check for memory holes

Commit 13ad59df67f1 ("mm, page_alloc: avoid page_to_pfn() when merging
buddies") moved the check for memory holes out of page_is_buddy() and
had the callers do the check.

But this wasn't done correctly in one place which caused ia64 to crash
very early in boot.

Update to fix that and make ia64 boot again.

[ v2: Vlastimil pointed out we don't need to call page_to_pfn()
      since we already have the result of that in "buddy_pfn" ]

Fixes: 13ad59df67f1 ("avoid page_to_pfn() when merging buddies")
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eaa64d2ffdc5..6cbde310abed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -873,7 +873,8 @@ done_merging:
 		higher_page = page + (combined_pfn - pfn);
 		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
 		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
-		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+		if (pfn_valid_within(buddy_pfn) &&
+		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
 			goto out;
-- 
cgit