diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 537 |
1 files changed, 338 insertions, 199 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e008a3df0485..d04211f0ef0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -126,13 +126,97 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -struct pagesets { - local_lock_t lock; -}; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { - .lock = INIT_LOCAL_LOCK(lock), -}; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + +/* + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid + * a migration causing the wrong PCP to be locked and remote memory being + * potentially allocated, pin the task to the CPU for the lookup+lock. + * preempt_disable is used on !RT because it is faster than migrate_disable. + * migrate_disable is used on RT because otherwise RT spinlock usage is + * interfered with and a high priority task cannot preempt the allocator. + */ +#ifndef CONFIG_PREEMPT_RT +#define pcpu_task_pin() preempt_disable() +#define pcpu_task_unpin() preempt_enable() +#else +#define pcpu_task_pin() migrate_disable() +#define pcpu_task_unpin() migrate_enable() +#endif +/* + * Generic helper to lookup and a per-cpu variable with an embedded spinlock. + * Return value should be used with equivalent unlock helper. + */ +#define pcpu_spin_lock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock(&_ret->member); \ + _ret; \ +}) + +#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock_irqsave(&_ret->member, flags); \ + _ret; \ +}) + +#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + pcpu_task_unpin(); \ + _ret = NULL; \ + } \ + _ret; \ +}) + +#define pcpu_spin_unlock(member, ptr) \ +({ \ + spin_unlock(&ptr->member); \ + pcpu_task_unpin(); \ +}) + +#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ +({ \ + spin_unlock_irqrestore(&ptr->member, flags); \ + pcpu_task_unpin(); \ +}) + +/* struct per_cpu_pages specific helpers. */ +#define pcp_spin_lock(ptr) \ + pcpu_spin_lock(struct per_cpu_pages, lock, ptr) + +#define pcp_spin_lock_irqsave(ptr, flags) \ + pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_trylock_irqsave(ptr, flags) \ + pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) + +#define pcp_spin_unlock_irqrestore(ptr, flags) \ + pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); #endif -/* work_structs for global per-cpu drains */ -struct pcpu_drain { - struct zone *zone; - struct work_struct work; -}; static DEFINE_MUTEX(pcpu_drain_mutex); -static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -356,7 +434,7 @@ static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; -static bool mirrored_kernelcore __meminitdata; +bool mirrored_kernelcore __initdata_memblock; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -524,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, { unsigned long *bitmap; unsigned long bitidx, word_bitidx; - unsigned long old_word, word; + unsigned long word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); @@ -540,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, flags <<= bitidx; word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } + do { + } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); } void set_pageblock_migratetype(struct page *page, int migratetype) @@ -653,7 +727,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - base = PAGE_ALLOC_COSTLY_ORDER + 1; + return NR_LOWORDER_PCP_LISTS; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -667,7 +741,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (pindex == NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -744,6 +818,14 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } +void destroy_large_folio(struct folio *folio) +{ + enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); + compound_page_dtors[dtor](&folio->page); +} + #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; @@ -785,7 +867,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, return false; __SetPageGuard(page); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -928,7 +1010,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add(&page->lru, &area->free_list[migratetype]); + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -938,7 +1020,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->lru, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -952,7 +1034,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -962,7 +1044,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); - list_del(&page->lru); + list_del(&page->buddy_list); __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; @@ -1296,18 +1378,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } @@ -1396,7 +1474,7 @@ static __always_inline bool free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1473,10 +1551,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1504,11 +1579,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; - page = list_last_entry(list, struct page, lru); + page = list_last_entry(list, struct page, pcp_list); mt = get_pcppage_migratetype(page); /* must delete to avoid corrupting pcp list */ - list_del(&page->lru); + list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; @@ -2361,7 +2436,7 @@ static inline bool check_new_pcp(struct page *page, unsigned int order) } #endif /* CONFIG_DEBUG_VM */ -static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +static inline bool should_skip_kasan_unpoison(gfp_t flags) { /* Don't skip if a software KASAN mode is enabled. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC) || @@ -2373,12 +2448,10 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) return true; /* - * With hardware tag-based KASAN enabled, skip if either: - * - * 1. Memory tags have already been cleared via tag_clear_highpage(). - * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + * With hardware tag-based KASAN enabled, skip if this has been + * requested via __GFP_SKIP_KASAN_UNPOISON. */ - return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); + return flags & __GFP_SKIP_KASAN_UNPOISON; } static inline bool should_skip_init(gfp_t flags) @@ -2397,6 +2470,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + int i; set_page_private(page, 0); set_page_refcounted(page); @@ -2422,8 +2496,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * should be initialized as well). */ if (init_tags) { - int i; - /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2431,17 +2503,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + if (!should_skip_kasan_unpoison(gfp_flags)) { /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); /* Note that memory is already initialized by KASAN. */ if (kasan_has_integrated_init()) init = false; + } else { + /* Ensure page_address() dereferencing does not fault. */ + for (i = 0; i != 1 << order; ++i) + page_kasan_tag_reset(page + i); } /* If memory is still not initialized, do it now. */ if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); @@ -3044,10 +3120,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3068,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * for IO devices that can merge IO requests if the physical * pages are ordered properly. */ - list_add_tail(&page->lru, list); + list_add_tail(&page->pcp_list, list); allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -3091,51 +3164,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to drain pagesets of this * currently executing processor on remote nodes after they have * expired. - * - * Note that this function must be called with the thread pinned to - * a single processor. */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - local_unlock_irqrestore(&pagesets.lock, flags); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif /* * Drain pcplists of the indicated processor and zone. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp, 0); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* * Drain pcplists of all zones on the indicated processor. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages(unsigned int cpu) { @@ -3148,9 +3218,6 @@ static void drain_pages(unsigned int cpu) /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. - * - * The CPU has to be pinned. When zone parameter is non-NULL, spill just - * the single zone's pages. */ void drain_local_pages(struct zone *zone) { @@ -3162,24 +3229,6 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } -static void drain_local_pages_wq(struct work_struct *work) -{ - struct pcpu_drain *drain; - - drain = container_of(work, struct pcpu_drain, work); - - /* - * drain_all_pages doesn't use proper cpu hotplug protection so - * we can race with cpu offline when the WQ can move this from - * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is alright but we also have to make sure to not move to - * a different one. - */ - migrate_disable(); - drain_local_pages(drain->zone); - migrate_enable(); -} - /* * The implementation of drain_all_pages(), exposing an extra parameter to * drain on all cpus. @@ -3201,13 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) static cpumask_t cpus_with_pcps; /* - * Make sure nobody triggers this path before mm_percpu_wq is fully - * initialized. - */ - if (WARN_ON_ONCE(!mm_percpu_wq)) - return; - - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need * the drain to be complete when the call returns. @@ -3256,14 +3298,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) } for_each_cpu(cpu, &cpus_with_pcps) { - struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); - - drain->zone = zone; - INIT_WORK(&drain->work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, &drain->work); + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } - for_each_cpu(cpu, &cpus_with_pcps) - flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -3272,8 +3311,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. - * - * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -3318,7 +3355,7 @@ void mark_free_pages(struct zone *zone) for_each_migratetype_order(order, t) { list_for_each_entry(page, - &zone->free_area[order].free_list[t], lru) { + &zone->free_area[order].free_list[t], buddy_list) { unsigned long i; pfn = page_to_pfn(page); @@ -3395,19 +3432,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, int migratetype, +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, unsigned int order) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; bool free_high; __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); - list_add(&page->lru, &pcp->lists[pindex]); + list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; /* @@ -3432,6 +3467,9 @@ static void free_unref_page_commit(struct page *page, int migratetype, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; @@ -3454,9 +3492,16 @@ void free_unref_page(struct page *page, unsigned int order) migratetype = MIGRATE_MOVABLE; } - local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, migratetype, order); - local_unlock_irqrestore(&pagesets.lock, flags); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (pcp) { + free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock_irqrestore(pcp, flags); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); } /* @@ -3465,6 +3510,8 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; unsigned long flags; int batch_count = 0; int migratetype; @@ -3489,8 +3536,18 @@ void free_unref_page_list(struct list_head *list) } } - local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); + + locked_zone = zone; + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); + } + /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. @@ -3500,19 +3557,21 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, migratetype, 0); + free_unref_page_commit(zone, pcp, page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); batch_count = 0; - local_lock_irqsave(&pagesets.lock, flags); + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } } - local_unlock_irqrestore(&pagesets.lock, flags); + + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); } /* @@ -3637,6 +3696,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, #endif } +static __always_inline +struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + unsigned int order, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + unsigned long flags; + + do { + page = NULL; + spin_lock_irqsave(&zone->lock, flags); + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ + if (order > 0 && alloc_flags & ALLOC_HARDER) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; + } + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + + return page; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -3670,8 +3766,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, return NULL; } - page = list_first_entry(list, struct page, lru); - list_del(&page->lru); + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); pcp->count -= 1 << order; } while (check_new_pcp(page, order)); @@ -3688,19 +3784,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; - local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) { + pcp_trylock_finish(UP_flags); + return NULL; + } /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. */ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); @@ -3717,9 +3823,14 @@ struct page *rmqueue(struct zone *preferred_zone, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { - unsigned long flags; struct page *page; + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and @@ -3729,53 +3840,23 @@ struct page *rmqueue(struct zone *preferred_zone, migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype, alloc_flags); - goto out; + if (likely(page)) + goto out; } } - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - - do { - page = NULL; - spin_lock_irqsave(&zone->lock, flags); - /* - * order-0 request can reach here when the pcplist is skipped - * due to non-CMA allocation context. HIGHATOMIC area is - * reserved for high-order atomic allocation, so order-0 - * request should skip it. - */ - if (order > 0 && alloc_flags & ALLOC_HARDER) - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); - if (!page) - goto failed; - } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - spin_unlock_irqrestore(&zone->lock, flags); - } while (check_new_pages(page, order)); - - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, 1); + page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, + migratetype); out: /* Separate test+clear to avoid unnecessary atomics */ - if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; - -failed: - spin_unlock_irqrestore(&zone->lock, flags); - return NULL; } #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -3968,11 +4049,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * need to be calculated. */ if (!order) { - long fast_free; + long usable_free; + long reserved; + + usable_free = free_pages; + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); - fast_free = free_pages; - fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); - if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) + /* reserved may over estimate high-atomic reserves. */ + usable_free -= min(usable_free, reserved); + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) return true; } @@ -4090,7 +4175,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -4623,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask) EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. + */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -4916,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; /* @@ -4926,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -5102,9 +5213,13 @@ retry: goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -5124,9 +5239,13 @@ retry: } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -5197,10 +5316,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - fs_reclaim_acquire(gfp_mask); - fs_reclaim_release(gfp_mask); - - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + might_alloc(gfp_mask); if (should_fail_alloc_page(gfp_mask, order)) return false; @@ -5248,6 +5364,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5328,11 +5445,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; + /* Is a parallel drain in progress? */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) + goto failed_irq; + /* Attempt the batch allocation */ - local_lock_irqsave(&pagesets.lock, flags); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; - while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5345,8 +5465,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp, pcp_list); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) + if (!nr_account) { + pcp_spin_unlock_irqrestore(pcp, flags); goto failed_irq; + } break; } nr_account++; @@ -5359,7 +5481,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); @@ -5368,7 +5491,7 @@ out: return nr_populated; failed_irq: - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_trylock_finish(UP_flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); @@ -5617,6 +5740,18 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } } nc->pagecnt_bias--; @@ -5799,14 +5934,14 @@ long si_mem_available(void) /* * Estimate the amount of memory available for userspace allocations, - * without causing swapping. + * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. */ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); @@ -5934,7 +6069,7 @@ static void show_migration_types(unsigned char type) void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; - int cpu; + int cpu, nid; struct zone *zone; pg_data_t *pgdat; @@ -6122,7 +6257,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "= %lukB\n", K(total)); } - hugetlb_show_meminfo(); + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); @@ -6421,9 +6560,8 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); - spin_lock(&lock); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -6460,7 +6598,7 @@ static void __build_all_zonelists(void *data) #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); } static noinline void __init @@ -7008,6 +7146,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]); |