diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 335 |
1 files changed, 227 insertions, 108 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b37435c274cf..d4205e5e41d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/jiffies.h> @@ -63,6 +64,7 @@ #include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> +#include <linux/page_table_check.h> #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> @@ -677,10 +679,8 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (order > PAGE_ALLOC_COSTLY_ORDER) order = pageblock_order; - VM_BUG_ON(order != pageblock_order); - } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif @@ -724,27 +724,37 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { - mem_cgroup_uncharge(page); + mem_cgroup_uncharge(page_folio(page)); free_the_page(page, compound_order(page)); } +static void prep_compound_head(struct page *page, unsigned int order) +{ + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); + atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); +} + +static void prep_compound_tail(struct page *head, int tail_idx) +{ + struct page *p = head + tail_idx; + + p->mapping = TAIL_MAPPING; + set_compound_head(p, head); +} + void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - p->mapping = TAIL_MAPPING; - set_compound_head(p, page); - } + for (i = 1; i < nr_pages; i++) + prep_compound_tail(page, i); - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + prep_compound_head(page, order); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1299,6 +1309,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1312,8 +1323,10 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) + if (compound) { ClearPageDoubleMap(page); + ClearPageHasHWPoisoned(page); + } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -1336,6 +1349,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1428,14 +1442,8 @@ static inline void prefetch_buddy(struct page *page) /* * Frees a number of pages from the PCP lists - * Assumes all pages on list are in same zone, and of same order. + * Assumes all pages on list are in same zone. * count is the number of pages to free. - * - * If the zone was previously in an "all pages pinned" state then look to - * see if this freeing clears that state. - * - * And clear the zone's pages_scanned counter, to hold off the "all pages are - * pinned" detection logic. */ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) @@ -1589,7 +1597,7 @@ static void __meminit init_reserved_page(unsigned long pfn) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct zone *zone = &pgdat->node_zones[zid]; - if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + if (zone_spans_pfn(zone, pfn)) break; } __init_single_page(pfn_to_page(pfn), pfn, zid, nid); @@ -2416,6 +2424,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -3147,9 +3156,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is alright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* @@ -3966,6 +3975,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= @@ -4208,7 +4219,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); @@ -4795,30 +4808,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { - /* - * If we didn't make any progress and have a lot of - * dirty + writeback pages then we should wait for - * an IO to complete to slow down the reclaim and - * prevent from pre mature OOM - */ - if (!did_some_progress) { - unsigned long write_pending; - - write_pending = zone_page_state_snapshot(zone, - NR_ZONE_WRITE_PENDING); - - if (2 * write_pending > reclaimable) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - return true; - } - } - ret = true; - goto out; + break; } } -out: /* * Memory allocation/reclaim might be called from a WQ context and the * current implementation of the WQ concurrency control doesn't @@ -4914,6 +4908,19 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; + /* + * Check for insane configurations where the cpuset doesn't contain + * any suitable zone to satisfy the request - e.g. non-movable + * GFP_HIGHUSER allocations from MOVABLE nodes only. + */ + if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { + struct zoneref *z = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, + &cpuset_current_mems_allowed); + if (!z->zone) + goto nopage; + } + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -5223,6 +5230,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(page_array && nr_pages - nr_populated == 0)) goto out; + /* Bulk allocator does not support memcg accounting. */ + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + goto failed; + /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) goto failed; @@ -5400,6 +5411,18 @@ out: } EXPORT_SYMBOL(__alloc_pages); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + preferred_nid, nodemask); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(__folio_alloc); + /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if @@ -5612,8 +5635,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); @@ -5637,8 +5660,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); struct page *p; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); p = alloc_pages_node(nid, gfp_mask, order); if (!p) @@ -5980,6 +6003,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "%s" " free:%lukB" + " boost:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" @@ -6000,6 +6024,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -6255,7 +6280,7 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] = load; + node_load[node] += load; node_order[nr_nodes++] = node; prev_node = node; @@ -6264,6 +6289,10 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", node_order[node]); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES @@ -6550,6 +6579,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone } #ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } +} + +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6558,6 +6656,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -6575,42 +6674,16 @@ void __ref memmap_init_zone_device(struct zone *zone, nr_pages = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone_idx, nid); + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer - * and zone_device_data. It is a bug if a ZONE_DEVICE page is - * ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->zone_device_data = NULL; + if (pfns_per_compound == 1) + continue; - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + pfns_per_compound); } pr_info("%s initialised %lu pages in %ums\n", __func__, @@ -7389,6 +7462,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { + int i; + pgdat_resize_init(pgdat); pgdat_init_split_queue(pgdat); @@ -7397,6 +7472,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); + for (i = 0; i < NR_VMSCAN_THROTTLE; i++) + init_waitqueue_head(&pgdat->reclaim_wait[i]); + pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); } @@ -8126,8 +8204,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char } if (pages && s) - pr_info("Freeing %s memory: %ldK\n", - s, pages << (PAGE_SHIFT - 10)); + pr_info("Freeing %s memory: %ldK\n", s, K(pages)); return pages; } @@ -8154,7 +8231,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) @@ -8172,14 +8249,13 @@ void __init mem_init_print_info(void) ", %luK highmem" #endif ")\n", - nr_free_pages() << (PAGE_SHIFT - 10), - physpages << (PAGE_SHIFT - 10), + K(nr_free_pages()), K(physpages), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10) + K(physpages - totalram_pages() - totalcma_pages), + K(totalcma_pages) #ifdef CONFIG_HIGHMEM - , totalhigh_pages() << (PAGE_SHIFT - 10) + , K(totalhigh_pages()) #endif ); } @@ -8452,7 +8528,7 @@ void setup_per_zone_wmarks(void) * 8192MB: 11584k * 16384MB: 16384k */ -int __meminit init_per_zone_wmark_min(void) +void calculate_min_free_kbytes(void) { unsigned long lowmem_kbytes; int new_min_free_kbytes; @@ -8460,16 +8536,17 @@ int __meminit init_per_zone_wmark_min(void) lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (new_min_free_kbytes > user_min_free_kbytes) { - min_free_kbytes = new_min_free_kbytes; - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 262144) - min_free_kbytes = 262144; - } else { + if (new_min_free_kbytes > user_min_free_kbytes) + min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); + else pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); - } + +} + +int __meminit init_per_zone_wmark_min(void) +{ + calculate_min_free_kbytes(); setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -8756,7 +8833,8 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; - huge = is_vm_area_hugepages(table); + if (table) + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -9197,8 +9275,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. @@ -9353,21 +9431,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif +/* + * This function returns a stable result only if called under zone lock. + */ bool is_free_buddy_page(struct page *page) { - struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && buddy_order(page_head) >= order) + if (PageBuddy(page_head) && + buddy_order_unsafe(page_head) >= order) break; } - spin_unlock_irqrestore(&zone->lock, flags); return order < MAX_ORDER; } @@ -9431,6 +9509,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9442,4 +9521,44 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ |