aboutsummaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c875
1 files changed, 432 insertions, 443 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e5b4488a0c5..e5486d47406e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -81,6 +81,7 @@
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
+#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -125,13 +126,97 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-struct pagesets {
- local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags) do { } while (0)
+#define pcp_trylock_finish(flag) do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags) local_irq_save(flags)
+#define pcp_trylock_finish(flags) local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin() preempt_disable()
+#define pcpu_task_unpin() preempt_enable()
+#else
+#define pcpu_task_pin() migrate_disable()
+#define pcpu_task_unpin() migrate_enable()
+#endif
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock(&_ret->member); \
+ _ret; \
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock_irqsave(&_ret->member, flags); \
+ _ret; \
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ if (!spin_trylock_irqsave(&_ret->member, flags)) { \
+ pcpu_task_unpin(); \
+ _ret = NULL; \
+ } \
+ _ret; \
+})
+
+#define pcpu_spin_unlock(member, ptr) \
+({ \
+ spin_unlock(&ptr->member); \
+ pcpu_task_unpin(); \
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \
+({ \
+ spin_unlock_irqrestore(&ptr->member, flags); \
+ pcpu_task_unpin(); \
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr) \
+ pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags) \
+ pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags) \
+ pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags) \
+ pcpu_spin_unlock_irqrestore(lock, ptr, flags)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -150,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
- struct zone *zone;
- struct work_struct work;
-};
static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -355,7 +434,7 @@ static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
-static bool mirrored_kernelcore __meminitdata;
+bool mirrored_kernelcore __initdata_memblock;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -481,8 +560,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
+ /*
+ * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * a consistent read of the memory array, so that results, even though
+ * racy, are not corrupted.
+ */
+ word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
@@ -519,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
+ unsigned long word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
@@ -535,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
+ do {
+ } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}
void set_pageblock_migratetype(struct page *page, int migratetype)
@@ -648,7 +727,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
- base = PAGE_ALLOC_COSTLY_ORDER + 1;
+ return NR_LOWORDER_PCP_LISTS;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -662,7 +741,7 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (pindex == NR_LOWORDER_PCP_LISTS)
order = pageblock_order;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -739,6 +818,14 @@ void prep_compound_page(struct page *page, unsigned int order)
prep_compound_head(page, order);
}
+void destroy_large_folio(struct folio *folio)
+{
+ enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+ VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+ compound_page_dtors[dtor](&folio->page);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
@@ -780,7 +867,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
return false;
__SetPageGuard(page);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -867,40 +954,6 @@ static inline void set_buddy_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-/*
- * This function checks whether a page is free && is the buddy
- * we can coalesce a page and its buddy if
- * (a) the buddy is not in a hole (check before calling!) &&
- * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order &&
- * (d) a page and its buddy are in the same zone.
- *
- * For recording whether a page is in the buddy system, we set PageBuddy.
- * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
- *
- * For recording page's order, we use page_private(page).
- */
-static inline bool page_is_buddy(struct page *page, struct page *buddy,
- unsigned int order)
-{
- if (!page_is_guard(buddy) && !PageBuddy(buddy))
- return false;
-
- if (buddy_order(buddy) != order)
- return false;
-
- /*
- * zone check is done late to avoid uselessly calculating
- * zone/node ids for pages that could never merge.
- */
- if (page_zone_id(page) != page_zone_id(buddy))
- return false;
-
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
- return true;
-}
-
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
@@ -957,7 +1010,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add(&page->lru, &area->free_list[migratetype]);
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -967,7 +1020,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->lru, &area->free_list[migratetype]);
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -981,7 +1034,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_move_tail(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -991,7 +1044,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
if (page_reported(page))
__ClearPageReported(page);
- list_del(&page->lru);
+ list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
@@ -1009,18 +1062,17 @@ static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
- struct page *higher_page, *higher_buddy;
- unsigned long combined_pfn;
+ unsigned long higher_page_pfn;
+ struct page *higher_page;
if (order >= MAX_ORDER - 2)
return false;
- combined_pfn = buddy_pfn & pfn;
- higher_page = page + (combined_pfn - pfn);
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+ higher_page_pfn = buddy_pfn & pfn;
+ higher_page = page + (higher_page_pfn - pfn);
- return page_is_buddy(higher_page, higher_buddy, order + 1);
+ return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
+ NULL) != NULL;
}
/*
@@ -1053,7 +1105,6 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
- unsigned int max_order = pageblock_order;
unsigned long buddy_pfn;
unsigned long combined_pfn;
struct page *buddy;
@@ -1069,18 +1120,32 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
-continue_merging:
- while (order < max_order) {
+ while (order < MAX_ORDER - 1) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- if (!page_is_buddy(page, buddy, order))
+ buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
+ if (!buddy)
goto done_merging;
+
+ if (unlikely(order >= pageblock_order)) {
+ /*
+ * We want to prevent merge between freepages on pageblock
+ * without fallbacks and normal pageblock. Without this,
+ * pageblock isolation could cause incorrect freepage or CMA
+ * accounting or HIGHATOMIC accounting.
+ */
+ int buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
+ }
+
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -1094,32 +1159,6 @@ continue_merging:
pfn = combined_pfn;
order++;
}
- if (order < MAX_ORDER - 1) {
- /* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on pageblock
- * without fallbacks and normal pageblock. Without this,
- * pageblock isolation could cause incorrect freepage or CMA
- * accounting or HIGHATOMIC accounting.
- *
- * We don't want to hit this code for the more frequent
- * low-order merging.
- */
- int buddy_mt;
-
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
-
- if (!page_is_buddy(page, buddy, order))
- goto done_merging;
- buddy_mt = get_pageblock_migratetype(buddy);
-
- if (migratetype != buddy_mt
- && (!migratetype_is_mergeable(migratetype) ||
- !migratetype_is_mergeable(buddy_mt)))
- goto done_merging;
- max_order = order + 1;
- goto continue_merging;
- }
done_merging:
set_buddy_order(page, order);
@@ -1141,6 +1180,64 @@ done_merging:
page_reporting_notify_free(order);
}
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page: the original free page
+ * @order: the order of the page
+ * @split_pfn_offset: split offset within the page
+ *
+ * Return -ENOENT if the free page is changed, otherwise 0
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset)
+{
+ struct zone *zone = page_zone(free_page);
+ unsigned long free_page_pfn = page_to_pfn(free_page);
+ unsigned long pfn;
+ unsigned long flags;
+ int free_page_order;
+ int mt;
+ int ret = 0;
+
+ if (split_pfn_offset == 0)
+ return ret;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mt = get_pageblock_migratetype(free_page);
+ if (likely(!is_migrate_isolate(mt)))
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+
+ del_page_from_free_list(free_page, zone, order);
+ for (pfn = free_page_pfn;
+ pfn < free_page_pfn + (1UL << order);) {
+ int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+ free_page_order = min_t(unsigned int,
+ pfn ? __ffs(pfn) : order,
+ __fls(split_pfn_offset));
+ __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+ mt, FPI_NONE);
+ pfn += 1UL << free_page_order;
+ split_pfn_offset -= (1UL << free_page_order);
+ /* we have done the first part, now switch to second part */
+ if (split_pfn_offset == 0)
+ split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+ }
+out:
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
@@ -1281,18 +1378,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
PageSkipKASanPoison(page);
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_pages(struct page *page, int numpages)
{
int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++) {
- u8 tag = page_kasan_tag(page + i);
- page_kasan_tag_reset(page + i);
- clear_highpage(page + i);
- page_kasan_tag_set(page + i, tag);
- }
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
}
@@ -1381,7 +1474,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1458,10 +1551,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1489,11 +1579,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
do {
int mt;
- page = list_last_entry(list, struct page, lru);
+ page = list_last_entry(list, struct page, pcp_list);
mt = get_pcppage_migratetype(page);
/* must delete to avoid corrupting pcp list */
- list_del(&page->lru);
+ list_del(&page->pcp_list);
count -= nr_pages;
pcp->count -= nr_pages;
@@ -2346,7 +2436,7 @@ static inline bool check_new_pcp(struct page *page, unsigned int order)
}
#endif /* CONFIG_DEBUG_VM */
-static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
/* Don't skip if a software KASAN mode is enabled. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
@@ -2358,12 +2448,10 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
return true;
/*
- * With hardware tag-based KASAN enabled, skip if either:
- *
- * 1. Memory tags have already been cleared via tag_clear_highpage().
- * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ * With hardware tag-based KASAN enabled, skip if this has been
+ * requested via __GFP_SKIP_KASAN_UNPOISON.
*/
- return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+ return flags & __GFP_SKIP_KASAN_UNPOISON;
}
static inline bool should_skip_init(gfp_t flags)
@@ -2382,6 +2470,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ int i;
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2407,8 +2496,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* should be initialized as well).
*/
if (init_tags) {
- int i;
-
/* Initialize both memory and tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
@@ -2416,17 +2503,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/* Note that memory is already initialized by the loop above. */
init = false;
}
- if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ if (!should_skip_kasan_unpoison(gfp_flags)) {
/* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
/* Note that memory is already initialized by KASAN. */
if (kasan_has_integrated_init())
init = false;
+ } else {
+ /* Ensure page_address() dereferencing does not fault. */
+ for (i = 0; i != 1 << order; ++i)
+ page_kasan_tag_reset(page + i);
}
/* If memory is still not initialized, do it now. */
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
SetPageSkipKASanPoison(page);
@@ -2476,6 +2567,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
+ trace_mm_page_alloc_zone_locked(page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
return page;
}
@@ -2999,7 +3093,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
- goto out;
+ return page;
}
}
retry:
@@ -3012,9 +3106,6 @@ retry:
alloc_flags))
goto retry;
}
-out:
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
@@ -3029,10 +3120,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
{
int i, allocated = 0;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -3053,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
- list_add_tail(&page->lru, list);
+ list_add_tail(&page->pcp_list, list);
allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3076,51 +3164,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long flags;
int to_drain, batch;
- local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
- if (to_drain > 0)
+ if (to_drain > 0) {
+ unsigned long flags;
+
+ /*
+ * free_pcppages_bulk expects IRQs disabled for zone->lock
+ * so even though pcp->lock is not intended to be IRQ-safe,
+ * it's needed in this context.
+ */
+ spin_lock_irqsave(&pcp->lock, flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
- unsigned long flags;
struct per_cpu_pages *pcp;
- local_lock_irqsave(&pagesets.lock, flags);
-
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ if (pcp->count) {
+ unsigned long flags;
- local_unlock_irqrestore(&pagesets.lock, flags);
+ /* See drain_zone_pages on why this is disabling IRQs */
+ spin_lock_irqsave(&pcp->lock, flags);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
/*
* Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages(unsigned int cpu)
{
@@ -3133,9 +3218,6 @@ static void drain_pages(unsigned int cpu)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
@@ -3147,24 +3229,6 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
-static void drain_local_pages_wq(struct work_struct *work)
-{
- struct pcpu_drain *drain;
-
- drain = container_of(work, struct pcpu_drain, work);
-
- /*
- * drain_all_pages doesn't use proper cpu hotplug protection so
- * we can race with cpu offline when the WQ can move this from
- * a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is alright but we also have to make sure to not move to
- * a different one.
- */
- migrate_disable();
- drain_local_pages(drain->zone);
- migrate_enable();
-}
-
/*
* The implementation of drain_all_pages(), exposing an extra parameter to
* drain on all cpus.
@@ -3186,13 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
static cpumask_t cpus_with_pcps;
/*
- * Make sure nobody triggers this path before mm_percpu_wq is fully
- * initialized.
- */
- if (WARN_ON_ONCE(!mm_percpu_wq))
- return;
-
- /*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
@@ -3241,14 +3298,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
- drain->zone = zone;
- INIT_WORK(&drain->work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, &drain->work);
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}
- for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -3257,8 +3311,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
@@ -3303,7 +3355,7 @@ void mark_free_pages(struct zone *zone)
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
- &zone->free_area[order].free_list[t], lru) {
+ &zone->free_area[order].free_list[t], buddy_list) {
unsigned long i;
pfn = page_to_pfn(page);
@@ -3380,19 +3432,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+ struct page *page, int migratetype,
unsigned int order)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
int high;
int pindex;
bool free_high;
__count_vm_event(PGFREE);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pindex = order_to_pindex(migratetype, order);
- list_add(&page->lru, &pcp->lists[pindex]);
+ list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
/*
@@ -3417,6 +3467,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
void free_unref_page(struct page *page, unsigned int order)
{
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
+ struct per_cpu_pages *pcp;
+ struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype;
@@ -3439,9 +3492,16 @@ void free_unref_page(struct page *page, unsigned int order)
migratetype = MIGRATE_MOVABLE;
}
- local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, migratetype, order);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ zone = page_zone(page);
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (pcp) {
+ free_unref_page_commit(zone, pcp, page, migratetype, order);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ } else {
+ free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+ }
+ pcp_trylock_finish(UP_flags);
}
/*
@@ -3450,6 +3510,8 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ struct per_cpu_pages *pcp = NULL;
+ struct zone *locked_zone = NULL;
unsigned long flags;
int batch_count = 0;
int migratetype;
@@ -3474,8 +3536,18 @@ void free_unref_page_list(struct list_head *list)
}
}
- local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
+ struct zone *zone = page_zone(page);
+
+ /* Different zone, different pcp lock. */
+ if (zone != locked_zone) {
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
+
+ locked_zone = zone;
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
+ }
+
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3485,19 +3557,21 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, migratetype, 0);
+ free_unref_page_commit(zone, pcp, page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
batch_count = 0;
- local_lock_irqsave(&pagesets.lock, flags);
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
}
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
}
/*
@@ -3622,6 +3696,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
#endif
}
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+ unsigned int order, unsigned int alloc_flags,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long flags;
+
+ do {
+ page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER)
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (!page) {
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
+ }
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
+
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone, 1);
+
+ return page;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3655,8 +3766,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
return NULL;
}
- page = list_first_entry(list, struct page, lru);
- list_del(&page->lru);
+ page = list_first_entry(list, struct page, pcp_list);
+ list_del(&page->pcp_list);
pcp->count -= 1 << order;
} while (check_new_pcp(page, order));
@@ -3673,19 +3784,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct list_head *list;
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
- local_lock_irqsave(&pagesets.lock, flags);
+ /*
+ * spin_trylock may fail due to a parallel drain. In the future, the
+ * trylock will also protect against IRQ reentrancy.
+ */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp) {
+ pcp_trylock_finish(UP_flags);
+ return NULL;
+ }
/*
* On allocation, reduce the number of pages that are batch freed.
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp->free_factor >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
@@ -3702,9 +3823,14 @@ struct page *rmqueue(struct zone *preferred_zone,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
- unsigned long flags;
struct page *page;
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
if (likely(pcp_allowed_order(order))) {
/*
* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3714,56 +3840,23 @@ struct page *rmqueue(struct zone *preferred_zone,
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
- goto out;
+ if (likely(page))
+ goto out;
}
}
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
- do {
- page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
- /*
- * order-0 request can reach here when the pcplist is skipped
- * due to non-CMA allocation context. HIGHATOMIC area is
- * reserved for high-order atomic allocation, so order-0
- * request should skip it.
- */
- if (order > 0 && alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
- if (!page)
- goto failed;
- }
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- spin_unlock_irqrestore(&zone->lock, flags);
- } while (check_new_pages(page, order));
-
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, 1);
+ page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+ migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
-
-failed:
- spin_unlock_irqrestore(&zone->lock, flags);
- return NULL;
}
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -3799,6 +3892,9 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
+ if (gfp_mask & __GFP_NOWARN)
+ fail_page_alloc.attr.no_warn = true;
+
return should_fail(&fail_page_alloc.attr, 1 << order);
}
@@ -3953,11 +4049,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* need to be calculated.
*/
if (!order) {
- long fast_free;
+ long usable_free;
+ long reserved;
+
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
- fast_free = free_pages;
- fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
- if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ /* reserved may over estimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
@@ -4068,13 +4168,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z;
struct zone *zone;
- struct pglist_data *last_pgdat_dirty_limit = NULL;
+ struct pglist_data *last_pgdat = NULL;
+ bool last_pgdat_dirty_ok = false;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
+ * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -4107,13 +4208,13 @@ retry:
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
- if (last_pgdat_dirty_limit == zone->zone_pgdat)
- continue;
+ if (last_pgdat != zone->zone_pgdat) {
+ last_pgdat = zone->zone_pgdat;
+ last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
+ }
- if (!node_dirty_ok(zone->zone_pgdat)) {
- last_pgdat_dirty_limit = zone->zone_pgdat;
+ if (!last_pgdat_dirty_ok)
continue;
- }
}
if (no_fallback && nr_online_nodes > 1 &&
@@ -4346,7 +4447,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*/
/* Exhausted what can be done so it's blame time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ if (out_of_memory(&oc) ||
+ WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
*did_some_progress = 1;
/*
@@ -4677,9 +4779,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
- if (last_pgdat != zone->zone_pgdat)
+ if (!managed_zone(zone))
+ continue;
+ if (last_pgdat != zone->zone_pgdat) {
wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
+ last_pgdat = zone->zone_pgdat;
+ }
}
}
@@ -5117,7 +5222,7 @@ nopage:
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
- if (WARN_ON_ONCE(!can_direct_reclaim))
+ if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
goto fail;
/*
@@ -5125,7 +5230,7 @@ nopage:
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
- WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+ WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
/*
* non failing costly orders are a hard requirement which we
@@ -5133,7 +5238,7 @@ nopage:
* so that we can identify them and convert them to something
* else.
*/
- WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+ WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
/*
* Help non-failing allocations by giving them access to memory
@@ -5177,10 +5282,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- fs_reclaim_acquire(gfp_mask);
- fs_reclaim_release(gfp_mask);
-
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
return false;
@@ -5228,6 +5330,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
{
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5308,11 +5411,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(!zone))
goto failed;
+ /* Is a parallel drain in progress? */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp)
+ goto failed_irq;
+
/* Attempt the batch allocation */
- local_lock_irqsave(&pagesets.lock, flags);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
-
while (nr_populated < nr_pages) {
/* Skip existing pages */
@@ -5324,9 +5430,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
pcp, pcp_list);
if (unlikely(!page)) {
- /* Try and get at least one page */
- if (!nr_populated)
+ /* Try and allocate at least one page */
+ if (!nr_account) {
+ pcp_spin_unlock_irqrestore(pcp, flags);
goto failed_irq;
+ }
break;
}
nr_account++;
@@ -5339,7 +5447,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5348,7 +5457,7 @@ out:
return nr_populated;
failed_irq:
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_trylock_finish(UP_flags);
failed:
page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5379,10 +5488,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
- if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
+ if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
return NULL;
- }
gfp &= gfp_allowed_mask;
/*
@@ -5781,14 +5888,14 @@ long si_mem_available(void)
/*
* Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
+ * without causing swapping or OOM.
*/
available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
@@ -5916,7 +6023,7 @@ static void show_migration_types(unsigned char type)
void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
- int cpu;
+ int cpu, nid;
struct zone *zone;
pg_data_t *pgdat;
@@ -6104,7 +6211,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT "= %lukB\n", K(total));
}
- hugetlb_show_meminfo();
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
@@ -6131,7 +6242,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
- if (managed_zone(zone)) {
+ if (populated_zone(zone)) {
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -6171,7 +6282,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
}
-#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
/**
@@ -6218,7 +6328,7 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
- val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val *= MAX_NUMNODES;
val += node_load[n];
if (val < min_val) {
@@ -6284,13 +6394,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
- int node, load, nr_nodes = 0;
+ int node, nr_nodes = 0;
nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
- load = nr_online_nodes;
prev_node = local_node;
memset(node_order, 0, sizeof(node_order));
@@ -6302,11 +6411,10 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
- node_load[node] += load;
+ node_load[node] += 1;
node_order[nr_nodes++] = node;
prev_node = node;
- load--;
}
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@ -6645,6 +6753,21 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
}
}
+/*
+ * With compound page geometry and when struct pages are stored in ram most
+ * tail pages are reused. Consequently, the amount of unique struct pages to
+ * initialize is a lot smaller that the total amount of struct pages being
+ * mapped. This is a paired / mild layering violation with explicit knowledge
+ * of how the sparse_vmemmap internals handle compound pages in the lack
+ * of an altmap. See vmemmap_populate_compound_pages().
+ */
+static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+ unsigned long nr_pages)
+{
+ return is_power_of_2(sizeof(struct page)) &&
+ !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
+}
+
static void __ref memmap_init_compound(struct page *head,
unsigned long head_pfn,
unsigned long zone_idx, int nid,
@@ -6709,7 +6832,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
continue;
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- pfns_per_compound);
+ compound_nr_pages(altmap, pfns_per_compound));
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
@@ -6978,6 +7101,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
memset(pcp, 0, sizeof(*pcp));
memset(pzstats, 0, sizeof(*pzstats));
+ spin_lock_init(&pcp->lock);
for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
INIT_LIST_HEAD(&pcp->lists[pindex]);
@@ -7870,7 +7994,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
usable_startpfn = memblock_region_memory_base_pfn(r);
- if (usable_startpfn < 0x100000) {
+ if (usable_startpfn < PHYS_PFN(SZ_4G)) {
mem_below_4gb_not_mirrored = true;
continue;
}
@@ -8919,7 +9043,7 @@ void *__init alloc_large_system_hash(const char *tablename,
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags);
+ table = vmalloc_huge(size, gfp_flags);
virt = true;
if (table)
huge = is_vm_area_hugepages(table);
@@ -8949,136 +9073,7 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/*
- * This function checks whether pageblock includes unmovable pages or not.
- *
- * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
- *
- * Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that it
- * cannot get removed (e.g., via memory unplug) concurrently.
- *
- */
-struct page *has_unmovable_pages(struct zone *zone, struct page *page,
- int migratetype, int flags)
-{
- unsigned long iter = 0;
- unsigned long pfn = page_to_pfn(page);
- unsigned long offset = pfn % pageblock_nr_pages;
-
- if (is_migrate_cma_page(page)) {
- /*
- * CMA allocations (alloc_contig_range) really need to mark
- * isolate CMA pageblocks even when they are not movable in fact
- * so consider them movable here.
- */
- if (is_migrate_cma(migratetype))
- return NULL;
-
- return page;
- }
-
- for (; iter < pageblock_nr_pages - offset; iter++) {
- page = pfn_to_page(pfn + iter);
-
- /*
- * Both, bootmem allocations and memory holes are marked
- * PG_reserved and are unmovable. We can even have unmovable
- * allocations inside ZONE_MOVABLE, for example when
- * specifying "movablecore".
- */
- if (PageReserved(page))
- return page;
-
- /*
- * If the zone is movable and we have ruled out all reserved
- * pages then it should be reasonably safe to assume the rest
- * is movable.
- */
- if (zone_idx(zone) == ZONE_MOVABLE)
- continue;
-
- /*
- * Hugepages are not in LRU lists, but they're movable.
- * THPs are on the LRU, but need to be counted as #small pages.
- * We need not scan over tail pages because we don't
- * handle each tail page individually in migration.
- */
- if (PageHuge(page) || PageTransCompound(page)) {
- struct page *head = compound_head(page);
- unsigned int skip_pages;
-
- if (PageHuge(page)) {
- if (!hugepage_migration_supported(page_hstate(head)))
- return page;
- } else if (!PageLRU(head) && !__PageMovable(head)) {
- return page;
- }
-
- skip_pages = compound_nr(head) - (page - head);
- iter += skip_pages - 1;
- continue;
- }
-
- /*
- * We can't use page_count without pin a page
- * because another CPU can free compound page.
- * This check already skips compound tails of THP
- * because their page->_refcount is zero at all time.
- */
- if (!page_ref_count(page)) {
- if (PageBuddy(page))
- iter += (1 << buddy_order(page)) - 1;
- continue;
- }
-
- /*
- * The HWPoisoned page may be not in buddy system, and
- * page_count() is not 0.
- */
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
- continue;
-
- /*
- * We treat all PageOffline() pages as movable when offlining
- * to give drivers a chance to decrement their reference count
- * in MEM_GOING_OFFLINE in order to indicate that these pages
- * can be offlined as there are no direct references anymore.
- * For actually unmovable PageOffline() where the driver does
- * not support this, we will fail later when trying to actually
- * move these pages that still have a reference count > 0.
- * (false negatives in this function only)
- */
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
- continue;
-
- if (__PageMovable(page) || PageLRU(page))
- continue;
-
- /*
- * If there are RECLAIMABLE pages, we need to check
- * it. But now, memory offline itself doesn't call
- * shrink_node_slabs() and it still to be fixed.
- */
- return page;
- }
- return NULL;
-}
-
#ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
-{
- return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
-}
-
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
- return ALIGN(pfn, MAX_ORDER_NR_PAGES);
-}
-
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
@@ -9101,7 +9096,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
#endif
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
@@ -9151,7 +9146,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
lru_cache_enable();
if (ret < 0) {
- if (ret == -EBUSY)
+ if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
@@ -9169,8 +9164,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* be either of the two.
* @gfp_mask: GFP mask to use during compaction
*
- * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned. The PFN range must belong to a single zone.
+ * The PFN range does not have to be pageblock aligned. The PFN range must
+ * belong to a single zone.
*
* The first thing this routine does is attempt to MIGRATE_ISOLATE all
* pageblocks in the range. Once isolated, the pageblocks should not
@@ -9184,7 +9179,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
- unsigned int order;
+ int order;
int ret = 0;
struct compact_control cc = {
@@ -9203,14 +9198,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
- * work, we align the range to biggest of the two pages so
- * that page allocator won't try to merge buddies from
- * different pageblocks and change MIGRATE_ISOLATE to some
- * other migration type.
+ * work, start_isolate_page_range() has special handlings for this.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
- * we are interested in). This will put all the pages in
+ * we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
@@ -9223,10 +9215,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype, 0);
+ ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
if (ret)
- return ret;
+ goto done;
drain_all_pages(cc.zone);
@@ -9246,7 +9237,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = 0;
/*
- * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * Pages from [start, end) are within a pageblock_nr_pages
* aligned blocks that are marked as MIGRATE_ISOLATE. What's
* more, all pages in [start, end) are free in page allocator.
* What we are going to do is to allocate all pages from
@@ -9305,8 +9296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
done:
- undo_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ undo_isolate_page_range(start, end, migratetype);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range);
@@ -9625,7 +9615,6 @@ bool put_page_back_buddy(struct page *page)
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
if (TestClearPageHWPoison(page)) {
- num_poisoned_pages_dec();
ret = true;
}
}