From aa187507ef8bb3178a3312d851e8485bd81913c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:41:45 -0800 Subject: mm: throttle show_mem() from warn_alloc() Tetsuo was stressing the OOM killer path with many parallel allocation requests when he noticed that it is not all that hard to swamp kernel logs with warn_alloc messages caused by allocation stalls. Even though the allocation stall message is triggered only once in 10s there might be many different tasks hitting it roughly around the same time. A big part of the output is show_mem() which can generate a lot of output even on small machines. There is no reason to show the state of the memory counters for each allocation stall, especially when multiple of them are reported in a short time period. Chances are that not much has changed since the last report. This patch simply rate limits show_mem called from warn_alloc to only dump something once per second. This should be enough to give us a clue why an allocation might be stalling while a burst of warnings will not swamp the log with too much data. While we are at it, extract all the show_mem related handling (filters) into a separate function warn_alloc_show_mem. This will make the code cleaner and as a bonus point we can distinguish which part of warn_alloc got throttled due to rate limiting as ___ratelimit dumps the caller. 
[akpm@linux-foundation.org: reduce scope of the ratelimit_states] Link: http://lkml.kernel.org/r/20161215101510.9030-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3e0c69a97b7..3c790ae4cb52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3007,18 +3007,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3033,6 +3027,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter); +} + +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3044,8 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask); } static inline struct page * -- cgit From 76741e776a37973a3e398d504069b3e55c5cc866 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 22 Feb 2017 15:41:48 -0800 Subject: mm, page_alloc: don't convert pfn to idx when merging In __free_one_page() we do the buddy merging arithmetics on "page/buddy index", which is just the lower MAX_ORDER bits of pfn. The operations we do that affect the higher bits are bitwise AND and subtraction (in that order), where the final result will be the same with the higher bits left unmasked, as long as these bits are equal for both buddies - which must be true by the definition of a buddy. We can therefore use pfn's directly instead of "index" and skip the zeroing of >MAX_ORDER bits. This can help a bit by itself, although compiler might be smart enough already. It also helps the next patch to avoid page_to_pfn() for memory hole checks. Link: http://lkml.kernel.org/r/20161216120009.20064-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++-- mm/page_alloc.c | 31 ++++++++++++++----------------- mm/page_isolation.c | 8 ++++---- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 7aa2ea0a8623..bfad3b5d2665 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -133,9 +133,9 @@ struct alloc_context { * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) +__find_buddy_pfn(unsigned long page_pfn, unsigned int order) { - return page_idx ^ (1 << order); + return page_pfn ^ (1 << order); } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c790ae4cb52..49d40261f8c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -787,9 +787,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +801,13 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +821,9 @@ continue_merging: 
zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +838,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -867,10 +864,10 @@ done_merging: */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a5594bfcc5ed..dadb7e74d7d6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) unsigned long flags, nr_pages; bool isolated_page = false; unsigned int order; - unsigned long page_idx, buddy_idx; + unsigned long pfn, buddy_pfn; struct page *buddy; zone = page_zone(page); @@ -102,9 +102,9 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (PageBuddy(page)) { order = page_order(page); if (order >= pageblock_order) { - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 
1); - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + pfn = page_to_pfn(page); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); if (pfn_valid_within(page_to_pfn(buddy)) && !is_migrate_isolate_page(buddy)) { -- cgit From 13ad59df67f19788f6c22985b1a33e466eceb643 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 22 Feb 2017 15:41:51 -0800 Subject: mm, page_alloc: avoid page_to_pfn() when merging buddies On architectures that allow memory holes, page_is_buddy() has to perform page_to_pfn() to check for the memory hole. After the previous patch, we have the pfn already available in __free_one_page(), which is the only caller of page_is_buddy(), so move the check there and avoid page_to_pfn(). Link: http://lkml.kernel.org/r/20161216120009.20064-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- mm/page_isolation.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 49d40261f8c4..af65c4eedc79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -714,7 +714,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. 
@@ -729,9 +729,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -808,6 +805,9 @@ continue_merging: while (order < max_order - 1) { buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -862,7 +862,7 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index dadb7e74d7d6..f4e17a57926a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -106,7 +106,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(page_to_pfn(buddy)) && + if (pfn_valid_within(buddy_pfn) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; -- cgit From d379f01de09570e06d84b4b09e5f4951821a1dc8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:42:00 -0800 Subject: oom, trace: add oom detection tracepoints should_reclaim_retry is the central decision point for declaring the OOM. It might be really useful to expose data used for this decision making when debugging an unexpected oom situations. 
Say we have an OOM report: [ 52.264001] mem_eater invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_score_adj=0 [ 52.267549] CPU: 3 PID: 3148 Comm: mem_eater Tainted: G W 4.8.0-oomtrace3-00006-gb21338b386d2 #1024 Now we can check the tracepoint data to see how we have ended up in this situation: mem_eater-3148 [003] .... 52.432801: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11134 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433269: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11103 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433712: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11100 min_wmark=11084 no_progress_loops=2 wmark_check=1 mem_eater-3148 [003] .... 52.434067: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11097 min_wmark=11084 no_progress_loops=3 wmark_check=1 mem_eater-3148 [003] .... 52.434414: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11094 min_wmark=11084 no_progress_loops=4 wmark_check=1 mem_eater-3148 [003] .... 52.434761: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11091 min_wmark=11084 no_progress_loops=5 wmark_check=1 mem_eater-3148 [003] .... 52.435108: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11087 min_wmark=11084 no_progress_loops=6 wmark_check=1 mem_eater-3148 [003] .... 52.435478: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11084 min_wmark=11084 no_progress_loops=7 wmark_check=0 mem_eater-3148 [003] .... 
52.435478: reclaim_retry_zone: node=0 zone=DMA order=0 reclaimable=0 available=1126 min_wmark=179 no_progress_loops=7 wmark_check=0 The above shows that we can quickly deduce that the reclaim stopped making any progress (see no_progress_loops increased in each round) and while there were still some 51 reclaimable pages they couldn't be dropped for some reason (vmscan trace points would tell us more about that part). available will represent reclaimable + free_pages scaled down per no_progress_loops factor. This is essentially an optimistic estimate of how much memory we would have when reclaiming everything. This can be compared to min_wmark to get a rough idea but the wmark_check tells the result of the watermark check which is more precise (includes lowmem reserves, considers the order etc.). As we can see no zone is eligible in the end and that is why we have triggered the oom in this situation. Please note that higher order requests might fail on the wmark_check even when there is much more memory available than min_wmark - e.g. when the memory is fragmented. A follow up tracepoint will help to debug those situations. 
Link: http://lkml.kernel.org/r/20161220130135.15719-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/oom.h | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 10 ++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 1e974983757e..9160da7a26a0 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -4,6 +4,7 @@ #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include +#include TRACE_EVENT(oom_score_adj_update, @@ -27,6 +28,47 @@ TRACE_EVENT(oom_score_adj_update, __entry->pid, __entry->comm, __entry->oom_score_adj) ); +TRACE_EVENT(reclaim_retry_zone, + + TP_PROTO(struct zoneref *zoneref, + int order, + unsigned long reclaimable, + unsigned long available, + unsigned long min_wmark, + int no_progress_loops, + bool wmark_check), + + TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), + + TP_STRUCT__entry( + __field( int, node) + __field( int, zone_idx) + __field( int, order) + __field( unsigned long, reclaimable) + __field( unsigned long, available) + __field( unsigned long, min_wmark) + __field( int, no_progress_loops) + __field( bool, wmark_check) + ), + + TP_fast_assign( + __entry->node = zone_to_nid(zoneref->zone); + __entry->zone_idx = zoneref->zone_idx; + __entry->order = order; + __entry->reclaimable = reclaimable; + __entry->available = available; + __entry->min_wmark = min_wmark; + __entry->no_progress_loops = no_progress_loops; + __entry->wmark_check = wmark_check; + ), + + TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", + __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), + __entry->order, + 
__entry->reclaimable, __entry->available, __entry->min_wmark, + __entry->no_progress_loops, + __entry->wmark_check) +); #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af65c4eedc79..d20f8c3139bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -3468,6 +3469,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3478,8 +3481,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for -- cgit From 65190cff3cc108b72e42cce67ed8b73dbad6b731 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:42:03 -0800 Subject: oom, trace: add compaction retry tracepoint Higher order requests oom debugging is currently quite hard. We do have some compaction points which can tell us how the compaction is operating but there is no trace point to tell us about compaction retry logic. This patch adds a one which will have the following format bash-3126 [001] .... 
1498.220001: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=withdrawn retries=0 max_retries=16 should_retry=0 we can see that the order 9 request is not retried even though we are in the highest compaction priority mode because the last compaction attempt was withdrawn. This means that compaction_zonelist_suitable must have returned false and there is no suitable zone to compact for this request and so no need to retry further. another example would be <...>-3137 [001] .... 81.501689: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=failed retries=0 max_retries=16 should_retry=0 in this case the order-9 compaction failed to find any suitable block. We do not retry anymore because this is a costly request and those do not go below COMPACT_PRIO_SYNC_LIGHT priority. Link: http://lkml.kernel.org/r/20161220130135.15719-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmflags.h | 26 ++++++++++++++++++++++++++ include/trace/events/oom.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 22 ++++++++++++++++------ 3 files changed, 81 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 75ed3220ede2..91554faed17e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -186,8 +186,32 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ EMe(COMPACT_CONTENDED, "contended") + +/* High-level compaction status feedback */ +#define COMPACTION_FAILED 1 +#define COMPACTION_WITHDRAWN 2 +#define COMPACTION_PROGRESS 3 + +#define compact_result_to_feedback(result) \ +({ \ + enum compact_result __result = result; \ + (compaction_failed(__result)) ? 
COMPACTION_FAILED : \ + (compaction_withdrawn(__result)) ? COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ +}) + +#define COMPACTION_FEEDBACK \ + EM(COMPACTION_FAILED, "failed") \ + EM(COMPACTION_WITHDRAWN, "withdrawn") \ + EMe(COMPACTION_PROGRESS, "progress") + +#define COMPACTION_PRIORITY \ + EM(COMPACT_PRIO_SYNC_FULL, "COMPACT_PRIO_SYNC_FULL") \ + EM(COMPACT_PRIO_SYNC_LIGHT, "COMPACT_PRIO_SYNC_LIGHT") \ + EMe(COMPACT_PRIO_ASYNC, "COMPACT_PRIO_ASYNC") #else #define COMPACTION_STATUS +#define COMPACTION_PRIORITY +#define COMPACTION_FEEDBACK #endif #ifdef CONFIG_ZONE_DMA @@ -225,6 +249,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ #define EMe(a, b) TRACE_DEFINE_ENUM(a); COMPACTION_STATUS +COMPACTION_PRIORITY +COMPACTION_FEEDBACK ZONE_TYPE /* diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 9160da7a26a0..38baeb27221a 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -69,6 +69,45 @@ TRACE_EVENT(reclaim_retry_zone, __entry->no_progress_loops, __entry->wmark_check) ); + +#ifdef CONFIG_COMPACTION +TRACE_EVENT(compact_retry, + + TP_PROTO(int order, + enum compact_priority priority, + enum compact_result result, + int retries, + int max_retries, + bool ret), + + TP_ARGS(order, priority, result, retries, max_retries, ret), + + TP_STRUCT__entry( + __field( int, order) + __field( int, priority) + __field( int, result) + __field( int, retries) + __field( int, max_retries) + __field( bool, ret) + ), + + TP_fast_assign( + __entry->order = order; + __entry->priority = priority; + __entry->result = compact_result_to_feedback(result); + __entry->retries = retries; + __entry->max_retries = max_retries; + __entry->ret = ret; + ), + + TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", + __entry->order, + __print_symbolic(__entry->priority, COMPACTION_PRIORITY), + __print_symbolic(__entry->result, COMPACTION_FEEDBACK), + __entry->retries, __entry->max_retries, + __entry->ret) +); 
+#endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d20f8c3139bb..05c0a59323bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3197,6 +3197,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3218,8 +3221,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3231,8 +3236,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3241,12 +3248,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * -- cgit From b92df1de5d289c0b5d653e72414bf0850b8511e0 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 22 Feb 2017 15:44:53 -0800 Subject: mm: page_alloc: skip over regions of invalid pfns where possible When using a sparse memory model memmap_init_zone() when invoked with the MEMMAP_EARLY context will skip over pages which aren't valid - ie. which aren't in a populated region of the sparse memory map. However if the memory map is extremely sparse then it can spend a long time linearly checking each PFN in a large non-populated region of the memory map & skipping it in turn. When CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled, we have sufficient information to quickly discover the next valid PFN given an invalid one by searching through the list of memory regions & skipping forwards to the first PFN covered by the memory region to the right of the non-populated region. Implement this in order to speed up memmap_init_zone() for systems with extremely sparse memory maps. James said "I have tested this patch on a virtual model of a Samurai CPU with a sparse memory map. The kernel boot time drops from 109 to 62 seconds. 
" Link: http://lkml.kernel.org/r/20161125185518.29885-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Tested-by: James Hartley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + mm/memblock.c | 25 +++++++++++++++++++++++++ mm/page_alloc.c | 11 ++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b759c9acf97..38bcf00cbed3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); +unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..a476d28e0733 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } +unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, + unsigned long max_pfn) +{ + struct memblock_type *type = &memblock.memory; + unsigned int right = type->cnt; + unsigned int mid, left = 0; + phys_addr_t addr = PFN_PHYS(pfn + 1); + + do { + mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else { + /* addr is within the region, so pfn + 1 is valid */ + return min(pfn + 1, max_pfn); + } + } while (left < right); + + return min(PHYS_PFN(type->regions[right].base), max_pfn); +} + /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05c0a59323bd..6da3169d3750 100644 --- a/mm/page_alloc.c 
+++ b/mm/page_alloc.c @@ -5103,8 +5103,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit From c02e50bb8a55a7adeeca5e411479ed70c6a2dfa1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:07 -0800 Subject: mm, page_alloc: do not report all nodes in show_mem Patch series "show_mem updates", v2. This is a mixture of one bug fix (patch 1), an enhancement (patch 2) and cleanups (the rest of the series). First two patches should be really straightforward. Patch 3 removes some arch specific show_mem implementations because I think they are quite outdated and do not really serve any useful purpose anymore. I think we should really strive to have a consistent show_mem output regardless of the architecture. If some architecture is really special and wants to dump something additional we should do that via an arch specific hook. The last patch adds nodemask parameter so that we do not rely on the hardcoded mems_allowed of the current task when doing the node filtering. I consider this more a cleanup than a fix because basically all users use a nodemask which is a subset of mems_allowed. There is only one call path in the memory hotplug which doesn't comply with this but that is hardly something to worry about. 
This patch (of 4): Commit 599d0c954f91 ("mm, vmscan: move LRU lists to node") has added per numa node statistics to show_mem but it forgot to add skip_free_areas_node to filter out nodes which are outside of the allocating task numa policy. Add this check to not pollute the output with the pointless information. Link: http://lkml.kernel.org/r/20170117091543.25850-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: David Rientjes Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6da3169d3750..cf6b53dc08f9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4368,6 +4368,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (skip_free_areas_node(filter, pgdat->node_id)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" -- cgit From a8e99259e7e32b67af2b447f0a570813c0c283ec Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:10 -0800 Subject: mm, page_alloc: warn_alloc print nodemask warn_alloc is currently used to report an allocation failure or an allocation stall. We print some details of the allocation request like the gfp mask and the request order. We do not print the allocation nodemask which is important when debugging the reason for the allocation failure as well. We already print the nodemask in the OOM report. Add nodemask to warn_alloc and print it in warn_alloc as well. 
Link: http://lkml.kernel.org/r/20170117091543.25850-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/vmalloc.c | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index dae6f58d67c8..28b6c3f8a7f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1942,8 +1942,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(2, 3) -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); +extern __printf(3, 4) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf6b53dc08f9..96c8fe602dfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3028,12 +3028,13 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) show_mem(filter); } -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) { struct va_format vaf; va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3047,7 +3048,8 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm)); + cpuset_print_current_mems_allowed(); dump_stack(); warn_alloc_show_mem(gfp_mask); @@ -3724,7 +3726,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3775,7 +3777,7 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5f5b09e9dccd..d89034a393f2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); @@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } -- cgit From 9af744d743170b5f5ef70031dea8d772d166ab28 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:16 -0800 Subject: lib/show_mem.c: teach show_mem to work with the given nodemask show_mem() allows to filter out node specific data which is irrelevant to the allocation request via SHOW_MEM_FILTER_NODES. The filtering is done in skip_free_areas_node which skips all nodes which are not in the mems_allowed of the current process. 
This works most of the time as expected because the nodemask shouldn't be outside of the allocating task but there are some exceptions. E.g. memory hotplug might want to request allocations from outside of the allowed nodes (see new_node_page). Get rid of this hardcoded behavior and push the allocation mask down the show_mem path and use it instead of cpuset_current_mems_allowed. NULL nodemask is interpreted as cpuset_current_mems_allowed. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170117091543.25850-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Cc: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/xmon/xmon.c | 2 +- arch/sparc/kernel/setup_32.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- include/linux/mm.h | 5 ++--- lib/show_mem.c | 4 ++-- mm/nommu.c | 6 +++--- mm/oom_kill.c | 2 +- mm/page_alloc.c | 38 ++++++++++++++++++------------------- 10 files changed, 32 insertions(+), 33 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 1be0499f5397..5720236d0266 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -916,7 +916,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0); + show_mem(0, NULL); break; default: termch = cmd; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index c4e65cb3280f..6f06058c5ae7 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -82,7 +82,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0); + show_free_areas(0, NULL); if (!is_idle_task(current)) { local_irq_enable(); sys_sync(); diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 
d390b9663dc3..57e6cef81ebe 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev) skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC); if (!skb) { - show_free_areas(0); + show_free_areas(0, NULL); continue; } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 701c085bb19b..71136742e606 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(0); + show_mem(0, NULL); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 3dd6a491cdba..397e1509fe51 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(0); + show_mem(0, NULL); } static void fn_show_state(struct vc_data *vc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 28b6c3f8a7f3..8a67cae5a07c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1152,8 +1152,7 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags); -extern bool skip_free_areas_node(unsigned int flags, int nid); +extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); int shmem_zero_setup(struct vm_area_struct *); #ifdef CONFIG_SHMEM @@ -1934,7 +1933,7 @@ extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags); +extern void show_mem(unsigned int flags, nodemask_t *nodemask); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct 
sysinfo *val, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1feed6a2b12a..0beaa1d899aa 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,13 +9,13 @@ #include #include -void show_mem(unsigned int filter) +void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter); + show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { unsigned long flags; diff --git a/mm/nommu.c b/mm/nommu.c index 24f9f5f39145..bc964c26be8c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1191,7 +1191,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1412,13 +1412,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..7176b6a754cf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES, nm); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96c8fe602dfb..644fb75f6f24 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3005,7 +3005,7 @@ static inline bool should_suppress_show_mem(void) return ret; } -static void warn_alloc_show_mem(gfp_t gfp_mask) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t 
*nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); @@ -3025,7 +3025,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter); + show_mem(filter, nodemask); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -3052,7 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) cpuset_print_current_mems_allowed(); dump_stack(); - warn_alloc_show_mem(gfp_mask); + warn_alloc_show_mem(gfp_mask, nm); } static inline struct page * @@ -4274,20 +4274,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4328,7 +4328,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. 
*/ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4336,7 +4336,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4370,7 +4370,7 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { - if (skip_free_areas_node(filter, pgdat->node_id)) + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; printk("Node %d" @@ -4422,7 +4422,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4487,7 +4487,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); -- cgit From 9a67f6488eca926f8356b2737fc9f8f6c0cbed85 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:19 -0800 Subject: mm: consolidate GFP_NOFAIL checks in the allocator slowpath Tetsuo Handa has pointed out that commit 0a0337e0d1d1 ("mm, oom: rework oom detection") has subtly changed semantic for costly high order requests with __GFP_NOFAIL and without __GFP_REPEAT and those can fail right now. My code inspection didn't reveal any such users in the tree but it is true that this might lead to unexpected allocation failures and subsequent OOPs. __alloc_pages_slowpath wrt. GFP_NOFAIL is hard to follow currently.
There are few special cases but we are lacking a catch all place to be sure we will not miss any case where the non failing allocation might fail. This patch reorganizes the code a bit and puts all those special cases under nopage label which is the generic go-to-fail path. Non failing allocations are retried or those that cannot retry like non-sleeping allocation go to the failure point directly. This should make the code flow much easier to follow and make it less error prone for future changes. While we are there we have to move the stall check up to catch potentially looping non-failing allocations. [akpm@linux-foundation.org: fix alloc_flags may-be-used-uninitalized] Link: http://lkml.kernel.org/r/20161220134904.21023-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 91 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 39 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 644fb75f6f24..dd36da6ffef5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3577,6 +3577,14 @@ retry_cpuset: no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. 
+ */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or @@ -3588,14 +3596,6 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - - /* - * The fast path uses conservative alloc_flags to succeed only until - * kswapd needs to be woken up, and to avoid the cost of setting up - * alloc_flags precisely. So we do that now. - */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3672,35 +3672,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. 
- */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3724,14 +3710,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3760,6 +3738,10 @@ retry: if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3777,6 +3759,37 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC 
request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + cond_resched(); + goto retry; + } +fail: warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: -- cgit From 06ad276ac18742c6b281698d41b27a290cd42407 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:22 -0800 Subject: mm, oom: do not enforce OOM killer for __GFP_NOFAIL automatically __alloc_pages_may_oom makes sure to skip the OOM killer depending on the allocation request. This includes lowmem requests, costly high order requests and others. For a long time __GFP_NOFAIL acted as an override for all those rules. This is not documented and it can be quite surprising as well. E.g. GFP_NOFS requests are not invoking the OOM killer but GFP_NOFS|__GFP_NOFAIL does so if we try to convert some of the existing open coded loops around allocator to nofail request (and we have done that in the past) then such a change would have a non trivial side effect which is far from obvious. Note that the primary motivation for skipping the OOM killer is to prevent from pre-mature invocation. The exception has been added by commit 82553a937f12 ("oom: invoke oom killer for __GFP_NOFAIL"). The changelog points out that the oom killer has to be invoked otherwise the request would be looping for ever. 
But this argument is rather weak because the OOM killer doesn't really guarantee a forward progress for those exceptional cases: - it will hardly help to form costly order which in turn can result in the system panic because of no oom killable task in the end - I believe we certainly do not want to put the system down just because there is a nasty driver asking for order-9 page with GFP_NOFAIL not realizing all the consequences. It is much better this request would loop for ever than the massive system disruption - lowmem is also highly unlikely to be freed during OOM killer - GFP_NOFS request could trigger while there is still a lot of memory pinned by filesystems. This patch simply removes the __GFP_NOFAIL special case in order to have a more clear semantic without surprising side effects. Signed-off-by: Michal Hocko Reported-by: Nils Holland Acked-by: Johannes Weiner Cc: Vlastimil Babka Cc: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- mm/page_alloc.c | 49 ++++++++++++++++++++++++------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7176b6a754cf..c7b48b4282d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1013,7 +1013,7 @@ bool out_of_memory(struct oom_control *oc) * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. 
*/ - if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) return true; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd36da6ffef5..1e37740837ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3090,32 +3090,31 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. - */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). 
Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; -- cgit From 6c18ba7a18997dadbf7ee912e15677ad2c9993e5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:25 -0800 Subject: mm: help __GFP_NOFAIL allocations which do not trigger OOM killer Now that __GFP_NOFAIL doesn't override decisions to skip the oom killer we are left with requests which require to loop inside the allocator without invoking the oom killer (e.g. GFP_NOFS|__GFP_NOFAIL used by fs code) and so they might, in very unlikely situations, loop for ever - e.g. other parallel request could starve them. This patch tries to limit the likelihood of such a lockup by giving these __GFP_NOFAIL requests a chance to move on by consuming a small part of memory reserves. We are using ALLOC_HARDER which should be enough to prevent from the starvation by regular allocation requests, yet it shouldn't consume enough from the reserves to disrupt high priority requests (ALLOC_HIGH). While we are at it, let's introduce a helper __alloc_pages_cpuset_fallback which enforces the cpusets but allows to fallback to ignore them if the first attempt fails. __GFP_NOFAIL requests can be considered important enough to allow cpuset runaway in order for the system to move on. It is highly unlikely that any of these will be GFP_USER anyway. 
Link: http://lkml.kernel.org/r/20161220134904.21023-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e37740837ac..a179607de26f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3055,6 +3055,26 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) warn_alloc_show_mem(gfp_mask, nm); } +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; +} + static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -3119,17 +3139,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3785,6 +3801,16 @@ nopage: */ WARN_ON_ONCE(order > 
PAGE_ALLOC_COSTLY_ORDER); + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + cond_resched(); goto retry; } -- cgit From 685dbf6f5a643c4bdb9323ee3544ec652505d2ea Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Feb 2017 15:46:28 -0800 Subject: mm, page_alloc: warn_alloc nodemask is NULL when cpusets are disabled The patch "mm, page_alloc: warn_alloc print nodemask" implicitly sets the allocation nodemask to cpuset_current_mems_allowed when there is no effective mempolicy. cpuset_current_mems_allowed is only effective when cpusets are enabled, which is also printed by warn_alloc(), so setting the nodemask to cpuset_current_mems_allowed is redundant and prevents debugging issues where ac->nodemask is not set properly in the page allocator. This provides better debugging output since cpuset_print_current_mems_allowed() is already provided. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701181347320.142399@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a179607de26f..c21b33668133 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3034,7 +3034,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - nodemask_t *nm = (nodemask) ? 
nodemask : &cpuset_current_mems_allowed; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3048,11 +3047,16 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm)); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + cpuset_print_current_mems_allowed(); dump_stack(); - warn_alloc_show_mem(gfp_mask, nm); + warn_alloc_show_mem(gfp_mask, nodemask); } static inline struct page * -- cgit From 066b23935578d3913c2df9bed7addbcdf4711f1a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Feb 2017 14:56:26 -0800 Subject: mm, page_alloc: split buffered_rmqueue() Patch series "Use per-cpu allocator for !irq requests and prepare for a bulk allocator", v5. This series is motivated by a conversation led by Jesper Dangaard Brouer at the last LSF/MM proposing a generic page pool for DMA-coherent pages. Part of his motivation was due to the overhead of allocating multiple order-0 that led some drivers to use high-order allocations and splitting them. This is very slow in some cases. The first two patches in this series restructure the page allocator such that it is relatively easy to introduce an order-0 bulk page allocator. A patch exists to do that and has been handed over to Jesper until an in-kernel user is created. The third patch prevents the per-cpu allocator being drained from IPI context as that can potentially corrupt the list after patch four is merged. The final patch alters the per-cpu allocator to make it exclusive to !irq requests. This cuts allocation/free overhead by roughly 30%. Performance tests from both Jesper and me are included in the patch. This patch (of 4): buffered_rmqueue removes a page from a given zone and uses the per-cpu list for order-0.
This is fine but a hypothetical caller that wanted multiple order-0 pages has to disable/reenable interrupts multiple times. This patch structures buffered_rmqueue such that it's relatively easy to build a bulk order-0 page allocator. There is no functional change. [mgorman@techsingularity.net: failed per-cpu refill may blow up] Link: http://lkml.kernel.org/r/20170124112723.mshmgwq2ihxku2um@techsingularity.net Link: http://lkml.kernel.org/r/20170123153906.3122-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Hillf Danton Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 128 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 49 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c21b33668133..284153d3e0fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2600,74 +2600,104 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif } +/* Remove page from the per-cpu list, caller must protect the list */ +static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + bool cold, struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + do { + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + return NULL; + } + + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } while (check_new_pcp(page)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + struct page *page;
+ unsigned long flags; + + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); + } + local_irq_restore(flags); + return page; +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline -struct page *buffered_rmqueue(struct zone *preferred_zone, +struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { unsigned long flags; struct page *page; - bool cold = ((gfp_flags & __GFP_COLD) != 0); if (likely(order == 0)) { - struct per_cpu_pages *pcp; - struct list_head *list; - - local_irq_save(flags); - do { - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } - - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - - list_del(&page->lru); - pcp->count--; + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype); + goto out; + } - } while (check_new_pcp(page)); - } else { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. 
+ */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + spin_lock_irqsave(&zone->lock, flags); - do { - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); - } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } if (!page) - goto failed; - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - } + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - VM_BUG_ON_PAGE(bad_range(zone, page), page); +out: + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; failed: @@ -2972,7 +3002,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); -- cgit From 9cd7555875bb09dad875e89a76f41f576e11c638 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Feb 2017 14:56:29 -0800 Subject: mm, page_alloc: split alloc_pages_nodemask() alloc_pages_nodemask does a number of preparation steps that determine what zones can be used for the allocation depending on a variety of factors.
This is fine but a hypothetical caller that wanted multiple order-0 pages has to do the preparation steps multiple times. This patch structures __alloc_pages_nodemask such that it's relatively easy to build a bulk order-0 page allocator. There is no functional change. Link: http://lkml.kernel.org/r/20170123153906.3122-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 75 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 29 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 284153d3e0fc..678b2882faaa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3855,60 +3855,77 @@ got_pg: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_mask, + unsigned int *alloc_flags) { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ - struct alloc_context ac = { - .high_zoneidx = gfp_zone(gfp_mask), - .zonelist = zonelist, - .nodemask = nodemask, - .migratetype = gfpflags_to_migratetype(gfp_mask), - }; + ac->high_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = zonelist; + ac->nodemask = nodemask; + ac->migratetype = gfpflags_to_migratetype(gfp_mask); if (cpusets_enabled()) { - alloc_mask |= __GFP_HARDWALL; - alloc_flags |= ALLOC_CPUSET; - if (!ac.nodemask) - ac.nodemask = &cpuset_current_mems_allowed; + *alloc_mask |= __GFP_HARDWALL; + *alloc_flags |= ALLOC_CPUSET; + if (!ac->nodemask) + ac->nodemask = 
&cpuset_current_mems_allowed; } - gfp_mask &= gfp_allowed_mask; - lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + return false; /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. It's possible to have an empty zonelist as a result * of __GFP_THISNODE and a memoryless node */ - if (unlikely(!zonelist->_zonerefs->zone)) - return NULL; + if (unlikely(!ac->zonelist->_zonerefs->zone)) + return false; - if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; + + return true; +} +/* Determine whether to spread dirty pages and what the first usable zone */ +static inline void finalise_ac(gfp_t gfp_mask, + unsigned int order, struct alloc_context *ac) +{ /* Dirty zone balancing only done in the fast path */ - ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ - ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, - ac.high_zoneidx, ac.nodemask); + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); +} + +/* + * This is the 'heart' of the zoned buddy allocator. 
+ */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + gfp_mask &= gfp_allowed_mask; + if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + return NULL; + + finalise_ac(gfp_mask, order, &ac); if (!ac.preferred_zoneref->zone) { page = NULL; /* -- cgit From 0ccce3b924212e121503619df97cc0f17189b77b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Feb 2017 14:56:32 -0800 Subject: mm, page_alloc: drain per-cpu pages from workqueue context The per-cpu page allocator can be drained immediately via drain_all_pages() which sends IPIs to every CPU. In the next patch, the per-cpu allocator will only be used for interrupt-safe allocations which prevents draining it from IPI context. This patch uses workqueues to drain the per-cpu lists instead. This is slower but no slowdown during intensive reclaim was measured and the paths that use drain_all_pages() are not that sensitive to performance. This is particularly true as the path would only be triggered when reclaim is failing. It also makes some sense to avoid storming a machine with IPIs when it's under memory pressure. Arguably, it should be further adjusted so that only one caller at a time is draining pages but it's beyond the scope of the current patch.
Link: http://lkml.kernel.org/r/20170123153906.3122-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Hillf Danton Cc: Jesper Dangaard Brouer Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 678b2882faaa..610a3db680ae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2339,19 +2339,21 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } +static void drain_local_pages_wq(struct work_struct *work) +{ + drain_local_pages(NULL); +} + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. * - * Note that this code is protected against sending an IPI to an offline - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but - * nothing keeps CPUs from showing up after we populated the cpumask and - * before the call to on_each_cpu_mask(). + * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { + struct work_struct __percpu *works; int cpu; /* @@ -2360,6 +2362,17 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* Workqueues cannot recurse */ + if (current->flags & PF_WQ_WORKER) + return; + + /* + * As this can be called from reclaim context, do not reenter reclaim. 
+ * An allocation failure can be handled, it's simply slower + */ + get_online_cpus(); + works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC); + /* * We don't care about racing with CPU hotplug event * as offline notification will cause the notified @@ -2390,8 +2403,25 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + + if (works) { + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(works, cpu); + INIT_WORK(work, drain_local_pages_wq); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(works, cpu)); + } else { + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct work; + + INIT_WORK(&work, drain_local_pages_wq); + schedule_work_on(cpu, &work); + flush_work(&work); + } + } + put_online_cpus(); } #ifdef CONFIG_HIBERNATION -- cgit From a459eeb7b852bcdac605123a500c61286c2a2c3d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Feb 2017 14:56:35 -0800 Subject: mm, page_alloc: do not depend on cpu hotplug locks inside the allocator Dmitry has reported the following lockdep splat lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3753 __mutex_lock_common kernel/locking/mutex.c:521 [inline] mutex_lock_nested+0x24e/0xff0 kernel/locking/mutex.c:621 pcpu_alloc+0xbda/0x1280 mm/percpu.c:896 __alloc_percpu+0x24/0x30 mm/percpu.c:1075 smpcfd_prepare_cpu+0x73/0xd0 kernel/smp.c:44 cpuhp_invoke_callback+0x254/0x1480 kernel/cpu.c:136 cpuhp_up_callbacks+0x81/0x2a0 kernel/cpu.c:493 _cpu_up+0x1e3/0x2a0 kernel/cpu.c:1057 do_cpu_up+0x73/0xa0 kernel/cpu.c:1087 cpu_up+0x18/0x20 kernel/cpu.c:1095 smp_init+0xe9/0xee kernel/smp.c:564 kernel_init_freeable+0x439/0x690 init/main.c:1010 kernel_init+0x13/0x180 init/main.c:941 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433 cpu_hotplug_begin cpu_hotplug.lock pcpu_alloc pcpu_alloc_mutex get_online_cpus+0x62/0x90 kernel/cpu.c:248 
drain_all_pages+0xf8/0x710 mm/page_alloc.c:2385 __alloc_pages_direct_reclaim mm/page_alloc.c:3440 [inline] __alloc_pages_slowpath+0x8fd/0x2370 mm/page_alloc.c:3778 __alloc_pages_nodemask+0x8f5/0xc60 mm/page_alloc.c:3980 __alloc_pages include/linux/gfp.h:426 [inline] __alloc_pages_node include/linux/gfp.h:439 [inline] alloc_pages_node include/linux/gfp.h:453 [inline] pcpu_alloc_pages mm/percpu-vm.c:93 [inline] pcpu_populate_chunk+0x1e1/0x900 mm/percpu-vm.c:282 pcpu_alloc+0xe01/0x1280 mm/percpu.c:998 __alloc_percpu_gfp+0x27/0x30 mm/percpu.c:1062 bpf_array_alloc_percpu kernel/bpf/arraymap.c:34 [inline] array_map_alloc+0x532/0x710 kernel/bpf/arraymap.c:99 find_and_alloc_map kernel/bpf/syscall.c:34 [inline] map_create kernel/bpf/syscall.c:188 [inline] SYSC_bpf kernel/bpf/syscall.c:870 [inline] SyS_bpf+0xd64/0x2500 kernel/bpf/syscall.c:827 entry_SYSCALL_64_fastpath+0x1f/0xc2 pcpu_alloc pcpu_alloc_mutex drain_all_pages get_online_cpus cpu_hotplug.lock cpu_hotplug_begin+0x206/0x2e0 kernel/cpu.c:304 _cpu_up+0xca/0x2a0 kernel/cpu.c:1011 do_cpu_up+0x73/0xa0 kernel/cpu.c:1087 cpu_up+0x18/0x20 kernel/cpu.c:1095 smp_init+0xe9/0xee kernel/smp.c:564 kernel_init_freeable+0x439/0x690 init/main.c:1010 kernel_init+0x13/0x180 init/main.c:941 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433 cpu_hotplug_begin cpu_hotplug.lock Pulling cpu hotplug locks inside the page allocator is just too dangerous. Let's remove the dependency by dropping get_online_cpus() from drain_all_pages. 
This is not so simple though because now we do not have a protection against cpu hotplug which means 2 things: - the work item might be executed on a different cpu in worker from unbound pool so it doesn't run pinned on the cpu - we have to make sure that we do not race with page_alloc_cpu_dead calling drain_pages_zone Disabling preemption in drain_local_pages_wq will solve the first problem: drain_local_pages will determine its local CPU from the WQ context, which will be stable after that point; page_alloc_cpu_dead is pinned to the CPU already. The latter condition is achieved by disabling IRQs in drain_pages_zone. Fixes: mm, page_alloc: drain per-cpu pages from workqueue context Link: http://lkml.kernel.org/r/20170207201950.20482-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Dmitry Vyukov Acked-by: Tejun Heo Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 610a3db680ae..8af0d4fa683d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2341,7 +2341,16 @@ void drain_local_pages(struct zone *zone) static void drain_local_pages_wq(struct work_struct *work) { + /* + * drain_all_pages doesn't use proper cpu hotplug protection so + * we can race with cpu offline when the WQ can move this from + * a cpu pinned worker to an unbound one. We can operate on a different + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ + preempt_disable(); drain_local_pages(NULL); + preempt_enable(); } /* @@ -2366,11 +2375,6 @@ void drain_all_pages(struct zone *zone) if (current->flags & PF_WQ_WORKER) return; - /* - * As this can be called from reclaim context, do not reenter reclaim.
- * An allocation failure can be handled, it's simply slower - */ - get_online_cpus(); works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC); /* @@ -2421,7 +2425,6 @@ void drain_all_pages(struct zone *zone) flush_work(&work); } } - put_online_cpus(); } #ifdef CONFIG_HIBERNATION -- cgit From 374ad05ab64d696303cec5cc8ec3a65d457b7b1c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Feb 2017 14:56:38 -0800 Subject: mm, page_alloc: only use per-cpu allocator for irq-safe requests Many workloads that allocate pages are not handling an interrupt at a time. As allocation requests may be from IRQ context, it's necessary to disable/enable IRQs for every page allocation. This cost is the bulk of the free path but also a significant percentage of the allocation path. This patch alters the locking and checks such that only irq-safe allocation requests use the per-cpu allocator. All others acquire the irq-safe zone->lock and allocate from the buddy allocator. It relies on disabling preemption to safely access the per-cpu structures. It could be slightly modified to avoid soft IRQs using it but it's not clear it's worthwhile. This modification may slow allocations from IRQ context slightly but the main gain from the per-cpu allocator is that it scales better for allocations from multiple contexts. There is an implicit assumption that intensive allocations from IRQ contexts on multiple CPUs from a single NUMA node are rare and that the vast majority of scaling issues are encountered in !IRQ contexts such as page faulting. It's worth noting that this patch is not required for a bulk page allocator but it significantly reduces the overhead. The following are results from a page allocator micro-benchmark.
Only order-0 is interesting as higher orders do not use the per-cpu allocator 4.10.0-rc2 4.10.0-rc2 vanilla irqsafe-v1r5 Amean alloc-odr0-1 287.15 ( 0.00%) 219.00 ( 23.73%) Amean alloc-odr0-2 221.23 ( 0.00%) 183.23 ( 17.18%) Amean alloc-odr0-4 187.00 ( 0.00%) 151.38 ( 19.05%) Amean alloc-odr0-8 167.54 ( 0.00%) 132.77 ( 20.75%) Amean alloc-odr0-16 156.00 ( 0.00%) 123.00 ( 21.15%) Amean alloc-odr0-32 149.00 ( 0.00%) 118.31 ( 20.60%) Amean alloc-odr0-64 138.77 ( 0.00%) 116.00 ( 16.41%) Amean alloc-odr0-128 145.00 ( 0.00%) 118.00 ( 18.62%) Amean alloc-odr0-256 136.15 ( 0.00%) 125.00 ( 8.19%) Amean alloc-odr0-512 147.92 ( 0.00%) 121.77 ( 17.68%) Amean alloc-odr0-1024 147.23 ( 0.00%) 126.15 ( 14.32%) Amean alloc-odr0-2048 155.15 ( 0.00%) 129.92 ( 16.26%) Amean alloc-odr0-4096 164.00 ( 0.00%) 136.77 ( 16.60%) Amean alloc-odr0-8192 166.92 ( 0.00%) 138.08 ( 17.28%) Amean alloc-odr0-16384 159.00 ( 0.00%) 138.00 ( 13.21%) Amean free-odr0-1 165.00 ( 0.00%) 89.00 ( 46.06%) Amean free-odr0-2 113.00 ( 0.00%) 63.00 ( 44.25%) Amean free-odr0-4 99.00 ( 0.00%) 54.00 ( 45.45%) Amean free-odr0-8 88.00 ( 0.00%) 47.38 ( 46.15%) Amean free-odr0-16 83.00 ( 0.00%) 46.00 ( 44.58%) Amean free-odr0-32 80.00 ( 0.00%) 44.38 ( 44.52%) Amean free-odr0-64 72.62 ( 0.00%) 43.00 ( 40.78%) Amean free-odr0-128 78.00 ( 0.00%) 42.00 ( 46.15%) Amean free-odr0-256 80.46 ( 0.00%) 57.00 ( 29.16%) Amean free-odr0-512 96.38 ( 0.00%) 64.69 ( 32.88%) Amean free-odr0-1024 107.31 ( 0.00%) 72.54 ( 32.40%) Amean free-odr0-2048 108.92 ( 0.00%) 78.08 ( 28.32%) Amean free-odr0-4096 113.38 ( 0.00%) 82.23 ( 27.48%) Amean free-odr0-8192 112.08 ( 0.00%) 82.85 ( 26.08%) Amean free-odr0-16384 110.38 ( 0.00%) 81.92 ( 25.78%) Amean total-odr0-1 452.15 ( 0.00%) 308.00 ( 31.88%) Amean total-odr0-2 334.23 ( 0.00%) 246.23 ( 26.33%) Amean total-odr0-4 286.00 ( 0.00%) 205.38 ( 28.19%) Amean total-odr0-8 255.54 ( 0.00%) 180.15 ( 29.50%) Amean total-odr0-16 239.00 ( 0.00%) 169.00 ( 29.29%) Amean total-odr0-32 229.00 ( 0.00%) 162.69 ( 
28.96%) Amean total-odr0-64 211.38 ( 0.00%) 159.00 ( 24.78%) Amean total-odr0-128 223.00 ( 0.00%) 160.00 ( 28.25%) Amean total-odr0-256 216.62 ( 0.00%) 182.00 ( 15.98%) Amean total-odr0-512 244.31 ( 0.00%) 186.46 ( 23.68%) Amean total-odr0-1024 254.54 ( 0.00%) 198.69 ( 21.94%) Amean total-odr0-2048 264.08 ( 0.00%) 208.00 ( 21.24%) Amean total-odr0-4096 277.38 ( 0.00%) 219.00 ( 21.05%) Amean total-odr0-8192 279.00 ( 0.00%) 220.92 ( 20.82%) Amean total-odr0-16384 269.38 ( 0.00%) 219.92 ( 18.36%) This is the alloc, free and total overhead of allocating order-0 pages in batches of 1 page up to 16384 pages. Avoiding disabling/enabling overhead massively reduces overhead. Alloc overhead is roughly reduced by 14-20% in most cases. The free path is reduced by 26-46% and the total reduction is significant. Many users require zeroing of pages from the page allocator which is the vast cost of allocation. Hence, the impact on a basic page faulting benchmark is not that significant 4.10.0-rc2 4.10.0-rc2 vanilla irqsafe-v1r5 Hmean page_test 656632.98 ( 0.00%) 675536.13 ( 2.88%) Hmean brk_test 3845502.67 ( 0.00%) 3867186.94 ( 0.56%) Stddev page_test 10543.29 ( 0.00%) 4104.07 ( 61.07%) Stddev brk_test 33472.36 ( 0.00%) 15538.39 ( 53.58%) CoeffVar page_test 1.61 ( 0.00%) 0.61 ( 62.15%) CoeffVar brk_test 0.87 ( 0.00%) 0.40 ( 53.84%) Max page_test 666513.33 ( 0.00%) 678640.00 ( 1.82%) Max brk_test 3882800.00 ( 0.00%) 3887008.66 ( 0.11%) This is from aim9 and the most notable outcome is that fault variability is reduced by the patch. The headline improvement is small as the overall fault cost, zeroing, page table insertion etc dominate relative to disabling/enabling IRQs in the per-cpu allocator. Similarly, little benefit was seen on networking benchmarks both localhost and between physical server/clients where other costs dominate. It's possible that this will only be noticeable on very high speed networks.
Jesper Dangaard Brouer independently tested this with a separate microbenchmark from https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Micro-benchmarked with [1] page_bench02: modprobe page_bench02 page_order=0 run_flags=$((2#010)) loops=$((10**8)); \ rmmod page_bench02 ; dmesg --notime | tail -n 4 Compared to baseline: 213 cycles(tsc) 53.417 ns - against this : 184 cycles(tsc) 46.056 ns - Saving : -29 cycles - Very close to expected 27 cycles saving [see below [2]] Micro benchmarking via time_bench_sample[3], we get the cost of these operations: time_bench: Type:for_loop Per elem: 0 cycles(tsc) 0.232 ns (step:0) time_bench: Type:spin_lock_unlock Per elem: 33 cycles(tsc) 8.334 ns (step:0) time_bench: Type:spin_lock_unlock_irqsave Per elem: 62 cycles(tsc) 15.607 ns (step:0) time_bench: Type:irqsave_before_lock Per elem: 57 cycles(tsc) 14.344 ns (step:0) time_bench: Type:spin_lock_unlock_irq Per elem: 34 cycles(tsc) 8.560 ns (step:0) time_bench: Type:simple_irq_disable_before_lock Per elem: 37 cycles(tsc) 9.289 ns (step:0) time_bench: Type:local_BH_disable_enable Per elem: 19 cycles(tsc) 4.920 ns (step:0) time_bench: Type:local_IRQ_disable_enable Per elem: 7 cycles(tsc) 1.864 ns (step:0) time_bench: Type:local_irq_save_restore Per elem: 38 cycles(tsc) 9.665 ns (step:0) [Mel's patch removes a ^^^^^^^^^^^^^^^^] ^^^^^^^^^ expected saving - preempt cost time_bench: Type:preempt_disable_enable Per elem: 11 cycles(tsc) 2.794 ns (step:0) [adds a preempt ^^^^^^^^^^^^^^^^^^^^^^] ^^^^^^^^^ adds this cost time_bench: Type:funcion_call_cost Per elem: 6 cycles(tsc) 1.689 ns (step:0) time_bench: Type:func_ptr_call_cost Per elem: 11 cycles(tsc) 2.767 ns (step:0) time_bench: Type:page_alloc_put Per elem: 211 cycles(tsc) 52.803 ns (step:0) Thus, expected improvement is: 38-11 = 27 cycles. 
[mgorman@techsingularity.net: s/preempt_enable_no_resched/preempt_enable/] Link: http://lkml.kernel.org/r/20170208143128.25ahymqlyspjcixu@techsingularity.net Link: http://lkml.kernel.org/r/20170123153906.3122-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Hillf Danton Acked-by: Jesper Dangaard Brouer Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8af0d4fa683d..6196eed96732 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1085,10 +1085,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; + unsigned long nr_scanned, flags; bool isolated_pageblocks; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) @@ -1137,7 +1137,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1145,8 +1145,9 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; - spin_lock(&zone->lock); + unsigned long nr_scanned, flags; + spin_lock_irqsave(&zone->lock, flags); + __count_vm_events(PGFREE, 1 << order); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); @@ -1156,7 +1157,7 @@ static void free_one_page(struct zone *zone, migratetype = 
get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1234,7 +1235,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) static void __free_pages_ok(struct page *page, unsigned int order) { - unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); @@ -1242,10 +1242,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype); - local_irq_restore(flags); } static void __init __free_pages_boot_core(struct page *page, unsigned int order) @@ -2217,8 +2214,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2254,7 +2252,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. 
*/ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return alloced; } @@ -2475,17 +2473,20 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; + if (in_interrupt()) { + __free_pages_ok(page, 0); + return; + } + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); - __count_vm_event(PGFREE); + preempt_disable(); /* * We only track unmovable, reclaimable and movable on pcp lists. @@ -2502,6 +2503,7 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } + __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2515,7 +2517,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - local_irq_restore(flags); + preempt_enable(); } /* @@ -2640,6 +2642,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, { struct page *page; + VM_BUG_ON(in_interrupt()); + do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, @@ -2670,9 +2674,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; - unsigned long flags; - local_irq_save(flags); + preempt_disable(); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); @@ -2680,7 +2683,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); } - local_irq_restore(flags); + preempt_enable(); return page; } @@ -2696,7 +2699,7 @@ struct page *rmqueue(struct zone 
*preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0)) { + if (likely(order == 0) && !in_interrupt()) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype); goto out; -- cgit From df76cee6bbeb2ed036f1622f63a99c28cecf6b30 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 24 Feb 2017 14:56:50 -0800 Subject: mm, page_alloc: remove redundant checks from alloc fastpath The allocation fast path contains two similar checks for zoneref->zone being NULL, where zoneref points either to the first zone in the zonelist, or to the preferred zone. These can be NULL either due to empty zonelist, or no zone being compatible with given nodemask or task's cpuset. These checks are unnecessary, because the zonelist walks in first_zones_zonelist() and get_page_from_freelist() handle a NULL starting zoneref->zone or preferred_zoneref->zone safely. It's safe to fallback to __alloc_pages_slowpath() where we also have the check early enough. Link: http://lkml.kernel.org/r/20170124150511.5710-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Anshuman Khandual Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6196eed96732..65876feb86f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3915,14 +3915,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - /* - * Check the zones suitable for the gfp_mask contain at least one - * valid zone. 
It's possible to have an empty zonelist as a result - * of __GFP_THISNODE and a memoryless node - */ - if (unlikely(!ac->zonelist->_zonerefs->zone)) - return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) *alloc_flags |= ALLOC_CMA; @@ -3962,22 +3954,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, return NULL; finalise_ac(gfp_mask, order, &ac); - if (!ac.preferred_zoneref->zone) { - page = NULL; - /* - * This might be due to race with cpuset_current_mems_allowed - * update, so make sure we retry with original nodemask in the - * slow path. - */ - goto no_zone; - } /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) goto out; -no_zone: /* * Runtime PM, block IO and its error handling path can deadlock * because I/O on the device might not complete. -- cgit From 5104782011a12b04fe9cfaa6f1085bdcdedd79c4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 24 Feb 2017 14:56:53 -0800 Subject: mm, page_alloc: don't check cpuset allowed twice in fast-path Since commit 682a3385e773 ("mm, page_alloc: inline the fast path of the zonelist iterator") we replace a NULL nodemask with cpuset_current_mems_allowed in the fast path, so that get_page_from_freelist() filters nodes allowed by the cpuset via for_next_zone_zonelist_nodemask(). In that case it's pointless to additionally check __cpuset_zone_allowed() in each iteration, which we can avoid by not adding ALLOC_CPUSET to alloc_flags in that scenario. This saves some cycles in the allocator fast path on systems with one or more non-root cpuset configured. In the slow path, ALLOC_CPUSET is reset according to __alloc_pages_slowpath(). Without configured cpusets, this code is disabled by a static key.
Link: http://lkml.kernel.org/r/20170124150511.5710-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 65876feb86f3..46c30fa26acd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3903,9 +3903,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; - *alloc_flags |= ALLOC_CPUSET; if (!ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; } lockdep_trace_alloc(gfp_mask); -- cgit From bd233f538d51c2cae6f0bfc2cf7f0960e1683b8a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Feb 2017 14:56:56 -0800 Subject: mm, page_alloc: use static global work_struct for draining per-cpu pages As suggested by Vlastimil Babka and Tejun Heo, this patch uses a static work_struct to co-ordinate the draining of per-cpu pages on the workqueue. Only one task can drain at a time but this is better than the previous scheme that allowed multiple tasks to send IPIs at a time. One consideration is whether parallel requests should synchronise against each other. This patch does not synchronise for a global drain as the common case for such callers is expected to be multiple parallel direct reclaimers competing for pages when the watermark is close to min. Draining the per-cpu list is unlikely to make much progress and serialising the drain is of dubious merit. Drains are synchronised for callers such as memory hotplug and CMA that care about the drain being complete when the function returns.
Link: http://lkml.kernel.org/r/20170125083038.rzb5f43nptmk7aed@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Tejun Heo Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 46c30fa26acd..41985aa4672d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +/* work_structs for global per-cpu drains */ +DEFINE_MUTEX(pcpu_drain_mutex); +DEFINE_PER_CPU(struct work_struct, pcpu_drain); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -2360,7 +2364,6 @@ static void drain_local_pages_wq(struct work_struct *work) */ void drain_all_pages(struct zone *zone) { - struct work_struct __percpu *works; int cpu; /* @@ -2373,7 +2376,16 @@ void drain_all_pages(struct zone *zone) if (current->flags & PF_WQ_WORKER) return; - works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC); + /* + * Do not drain if one is already in progress unless it's specific to + * a zone. Such callers are primarily CMA and memory hotplug and need + * the drain to be complete when the call returns. 
+ */ + if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { + if (!zone) + return; + mutex_lock(&pcpu_drain_mutex); + } /* * We don't care about racing with CPU hotplug event @@ -2406,23 +2418,15 @@ void drain_all_pages(struct zone *zone) cpumask_clear_cpu(cpu, &cpus_with_pcps); } - if (works) { - for_each_cpu(cpu, &cpus_with_pcps) { - struct work_struct *work = per_cpu_ptr(works, cpu); - INIT_WORK(work, drain_local_pages_wq); - schedule_work_on(cpu, work); - } - for_each_cpu(cpu, &cpus_with_pcps) - flush_work(per_cpu_ptr(works, cpu)); - } else { - for_each_cpu(cpu, &cpus_with_pcps) { - struct work_struct work; - - INIT_WORK(&work, drain_local_pages_wq); - schedule_work_on(cpu, &work); - flush_work(&work); - } + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); + INIT_WORK(work, drain_local_pages_wq); + schedule_work_on(cpu, work); } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + + mutex_unlock(&pcpu_drain_mutex); } #ifdef CONFIG_HIBERNATION -- cgit From 0efadf48bca01f17cb64ebceaf528590b2bc7665 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 24 Feb 2017 14:57:39 -0800 Subject: mm/hotplug: enable memory hotplug for non-lru movable pages We had considered all of the non-lru pages as unmovable before commit bda807d44454 ("mm: migrate: support non-lru movable page migration"). But now some of non-lru pages like zsmalloc, virtio-balloon pages also become movable. So we can offline such blocks by using non-lru page migration. This patch straightforwardly adds non-lru migration code, which means adding non-lru related code to the functions which scan over pfn and collect pages to be migrated and isolate them before migration. 
Signed-off-by: Yisheng Xie Cc: Michal Hocko Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 28 +++++++++++++++++----------- mm/page_alloc.c | 8 ++++++-- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5c4f48409347..7946375fe466 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1531,10 +1531,10 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, } /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages - * and hugepages). We scan pfn because it's much easier than scanning over - * linked list. This function returns the pfn of the first found movable - * page if it's found, otherwise 0. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, + * non-lru movable pages and hugepages). We scan pfn because it's much + * easier than scanning over linked list. This function returns the pfn + * of the first found movable page if it's found, otherwise 0. */ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { @@ -1545,6 +1545,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (__PageMovable(page)) + return pfn; if (PageHuge(page)) { if (page_huge_active(page)) return pfn; @@ -1621,21 +1623,25 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!get_page_unless_zero(page)) continue; /* - * We can skip free pages. And we can only deal with pages on - * LRU. + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. 
*/ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } else { #ifdef CONFIG_DEBUG_VM - pr_alert("removing pfn %lx from LRU failed\n", pfn); - dump_page(page, "failed to remove from LRU"); + pr_alert("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); #endif put_page(page); /* Because we don't have big zone->lock. we should diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41985aa4672d..2d34cdb70f1d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7227,8 +7227,9 @@ void *__init alloc_large_system_hash(const char *tablename, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. 
*/ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -7284,6 +7285,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* -- cgit From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:37 -0800 Subject: mm: alloc_contig_range: allow to specify GFP mask Currently alloc_contig_range assumes that the compaction should be done with the default GFP_KERNEL flags. This is probably right for all current uses of this interface, but may change as CMA is used in more use-cases (including being the default DMA memory allocator on some platforms). Change the function prototype, to allow for passing through the GFP mask set by upper layers. Also respect global restrictions by applying memalloc_noio_flags to the passed in flags. Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/cma.c | 3 ++- mm/hugetlb.c | 3 ++- mm/page_alloc.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0fe0b6295ab5..db373b9d3223 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void) #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. 
*/ extern int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype); + unsigned migratetype, gfp_t gfp_mask); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif diff --git a/mm/cma.c b/mm/cma.c index 94b3460cd608..c6aed23ca6df 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -402,7 +402,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + GFP_KERNEL); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 167fd0722c15..2e0e8159ce8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2d34cdb70f1d..8a0f33624335 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7399,6 +7399,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7412,7 +7413,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). 
*/ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7424,7 +7425,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); -- cgit From f2bf14d14dbc50dc56be3ebd18652ea2738ef6f8 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 24 Feb 2017 14:58:56 -0800 Subject: mm/page_alloc.c: remove duplicate inclusion of page_ext.h Link: http://lkml.kernel.org/r/20170202011942.1609-1-standby24x7@gmail.com Signed-off-by: Masanari Iida Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a0f33624335..f8ef2c90edbe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include -- cgit From e02dc017c3032dcdce1b993af0db135462e1b4b7 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 24 Feb 2017 14:59:33 -0800 Subject: mm/page_alloc: fix nodes for reclaim in fast path When @node_reclaim_node isn't 0, the page allocator tries to reclaim pages if the amount of free memory in the zones are below the low watermark. On Power platform, none of NUMA nodes are scanned for page reclaim because no nodes match the condition in zone_allows_reclaim(). On Power platform, RECLAIM_DISTANCE is set to 10 which is the distance of Node-A to Node-A. So the preferred node even won't be scanned for page reclaim. 
__alloc_pages_nodemask() get_page_from_freelist() zone_allows_reclaim() Anton proposed the test code as below: # cat alloc.c : int main(int argc, char *argv[]) { void *p; unsigned long size; unsigned long start, end; start = time(NULL); size = strtoul(argv[1], NULL, 0); printf("To allocate %ldGB memory\n", size); size <<= 30; p = malloc(size); assert(p); memset(p, 0, size); end = time(NULL); printf("Used time: %ld seconds\n", end - start); sleep(3600); return 0; } The system I use for testing has two NUMA nodes. Both have 128GB memory. In the scenario below, the page caches on node#0 should be reclaimed when it encounters pressure to accommodate request of allocation. # echo 2 > /proc/sys/vm/zone_reclaim_mode; \ sync; \ echo 3 > /proc/sys/vm/drop_caches; \ # taskset -c 0 cat file.32G > /dev/null; \ grep FilePages /sys/devices/system/node/node0/meminfo Node 0 FilePages: 33619712 kB # taskset -c 0 ./alloc 128 # grep FilePages /sys/devices/system/node/node0/meminfo Node 0 FilePages: 33619840 kB # grep MemFree /sys/devices/system/node/node0/meminfo Node 0 MemFree: 186816 kB With the patch applied, the pagecache on node-0 is reclaimed when its free memory is running out. It's the expected behaviour. 
# echo 2 > /proc/sys/vm/zone_reclaim_mode; \ sync; \ echo 3 > /proc/sys/vm/drop_caches # taskset -c 0 cat file.32G > /dev/null; \ grep FilePages /sys/devices/system/node/node0/meminfo Node 0 FilePages: 33605568 kB # taskset -c 0 ./alloc 128 # grep FilePages /sys/devices/system/node/node0/meminfo Node 0 FilePages: 1379520 kB # grep MemFree /sys/devices/system/node/node0/meminfo Node 0 MemFree: 317120 kB Fixes: 5f7a75acdb24 ("mm: page_alloc: do not cache reclaim distances") Link: http://lkml.kernel.org/r/1486532455-29613-1-git-send-email-gwshan@linux.vnet.ibm.com Signed-off-by: Gavin Shan Acked-by: Mel Gorman Acked-by: Michal Hocko Cc: Anton Blanchard Cc: Michael Ellerman Cc: [3.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8ef2c90edbe..4c2011f6b08f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2944,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, #ifdef CONFIG_NUMA static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -- cgit From ad69444e75d77981291ccf807f48d81e8fca010f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 24 Feb 2017 14:59:45 -0800 Subject: mm/page_alloc.c: remove redundant init code for ZONE_MOVABLE arch_zone_lowest/highest_possible_pfn[] is set to 0 and [ZONE_MOVABLE] is skipped in the loop. No need to reset them to 0 again. This patch just removes the redundant code. 
Link: http://lkml.kernel.org/r/20170209141731.60208-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Anshuman Khandual Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c2011f6b08f..9f9623d690d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6489,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; - arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); -- cgit From 89d790ab31d033d67635f6362d57ea64e47708fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Feb 2017 14:29:01 -0800 Subject: scripts/spelling.txt: add "algined" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: algined||aligned While we are here, fix the "appplication" in the touched line in drivers/block/loop.c. Also, fix the "may not naturally ..." to "may not be naturally ..." in the touched line in mm/page_alloc. 
Link: http://lkml.kernel.org/r/1481573103-11329-9-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 2 +- mm/page_alloc.c | 2 +- scripts/spelling.txt | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 304377182c1a..4b52a1690329 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -186,7 +186,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) * * TODO: the above condition may be loosed in the future, and * direct I/O may be switched runtime at that time because most - * of requests in sane appplications should be PAGE_SIZE algined + * of requests in sane applications should be PAGE_SIZE aligned */ if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f9623d690d6..a7a6aac95a6d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5925,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * the zone and SPARSEMEM is in use. If there are holes within the * zone, each populated memory region may cost us one or two extra * memmap pages due to alignment because memmap pages for each - * populated regions may not naturally algined on page boundary. + * populated regions may not be naturally aligned on page boundary. * So the (present_pages >> 4) heuristic is a tradeoff for that. 
*/ if (spanned_pages > present_pages + (present_pages >> 4) && diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 12777ab32306..096e8b8b6502 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -65,6 +65,7 @@ afecting||affecting agaist||against albumns||albums alegorical||allegorical +algined||aligned algorith||algorithm algorithmical||algorithmically algoritm||algorithm -- cgit From 5b3cc15aff243cb518cbeed8b1a220cbfd023d9c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 20:43:54 +0100 Subject: sched/headers: Prepare to move the memalloc_noio_*() APIs to <linux/sched/mm.h> Update the .c files that depend on these APIs. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/base/power/runtime.c | 2 +- drivers/md/dm-bufio.c | 1 + drivers/md/dm-ioctl.c | 1 + drivers/usb/core/hub.c | 2 +- fs/ocfs2/cluster/tcp.c | 1 + fs/xfs/kmem.c | 1 + fs/xfs/xfs_buf.c | 1 + mm/page_alloc.c | 1 + mm/vmscan.c | 1 + net/ceph/crypto.c | 1 + 10 files changed, 10 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index a14fac6a01d3..7bcf80fa9ada 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -7,7 +7,7 @@ * This file is released under the GPLv2. 
*/ -#include +#include #include #include #include diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d36d427a9efb..df4859f6ac6a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a5a9b17f0f7f..4da6fc6b1ffd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index a56c75e09786..f0dd08198d74 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ec000575e863..4348027384f5 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -54,6 +54,7 @@ */ #include +#include #include #include #include diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 339c696bbc01..2dfdc62f795e 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include +#include #include #include #include diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8c7d01b75922..b6208728ba39 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "xfs_format.h" #include "xfs_log_format.h" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a7a6aac95a6d..eaa64d2ffdc5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 70aa739c6b68..bc8031ef994d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 
85747b7f91a9..46008d5ac504 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include -- cgit From b4fb8f66f1ae2e167d06c12d018025a8d4d3ba7e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 8 Mar 2017 09:35:39 -0800 Subject: mm, page_alloc: Add missing check for memory holes Commit 13ad59df67f1 ("mm, page_alloc: avoid page_to_pfn() when merging buddies") moved the check for memory holes out of page_is_buddy() and had the callers do the check. But this wasn't done correctly in one place which caused ia64 to crash very early in boot. Update to fix that and make ia64 boot again. [ v2: Vlastimil pointed out we don't need to call page_to_pfn() since we already have the result of that in "buddy_pfn" ] Fixes: 13ad59df67f1 ("avoid page_to_pfn() when merging buddies") Cc: Mel Gorman Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Andrew Morton Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa64d2ffdc5..6cbde310abed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -873,7 +873,8 @@ done_merging: higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + if (pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); goto out; -- cgit