Diffstat (limited to 'mm/mm_init.c')
-rw-r--r-- | mm/mm_init.c | 218
1 file changed, 92 insertions, 126 deletions
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 549e76af8f82..f72b852bd5b8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -24,9 +24,11 @@
 #include <linux/page_ext.h>
 #include <linux/pti.h>
 #include <linux/pgtable.h>
+#include <linux/stackdepot.h>
 #include <linux/swap.h>
 #include <linux/cma.h>
 #include <linux/crash_dump.h>
+#include <linux/execmem.h>
 #include "internal.h"
 #include "slab.h"
 #include "shuffle.h"
@@ -226,7 +228,6 @@ static unsigned long required_movablecore_percent __initdata;
 
 static unsigned long nr_kernel_pages __initdata;
 static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
 
 static bool deferred_struct_pages __meminitdata;
 
@@ -1144,7 +1145,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid,
  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
  * then all holes in the requested range will be accounted for.
  */
-unsigned long __init __absent_pages_in_range(int nid,
+static unsigned long __init __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
 				unsigned long range_end_pfn)
 {
@@ -1265,6 +1266,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
 	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
 }
 
+static void __init calc_nr_kernel_pages(void)
+{
+	unsigned long start_pfn, end_pfn;
+	phys_addr_t start_addr, end_addr;
+	u64 u;
+#ifdef CONFIG_HIGHMEM
+	unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
+		start_pfn = PFN_UP(start_addr);
+		end_pfn   = PFN_DOWN(end_addr);
+
+		if (start_pfn < end_pfn) {
+			nr_all_pages += end_pfn - start_pfn;
+#ifdef CONFIG_HIGHMEM
+			start_pfn = clamp(start_pfn, 0, high_zone_low);
+			end_pfn = clamp(end_pfn, 0, high_zone_low);
+#endif
+			nr_kernel_pages += end_pfn - start_pfn;
+		}
+	}
+}
+
 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 						unsigned long node_start_pfn,
 						unsigned long node_end_pfn)
@@ -1308,26 +1333,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 }
 
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
-						unsigned long present_pages)
-{
-	unsigned long pages = spanned_pages;
-
-	/*
-	 * Provide a more accurate estimation if there are holes within
-	 * the zone and SPARSEMEM is in use. If there are holes within the
-	 * zone, each populated memory region may cost us one or two extra
-	 * memmap pages due to alignment because memmap pages for each
-	 * populated regions may not be naturally aligned on page boundary.
-	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
-	 */
-	if (spanned_pages > present_pages + (present_pages >> 4) &&
-	    IS_ENABLED(CONFIG_SPARSEMEM))
-		pages = present_pages;
-
-	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void pgdat_init_split_queue(struct pglist_data *pgdat)
 {
@@ -1542,15 +1547,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
 }
 #endif
 
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
 static void __init free_area_init_core(struct pglist_data *pgdat)
 {
 	enum zone_type j;
@@ -1561,47 +1557,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, freesize, memmap_pages;
-
-		size = zone->spanned_pages;
-		freesize = zone->present_pages;
-
-		/*
-		 * Adjust freesize so that it accounts for how much memory
-		 * is used by this zone for memmap. This affects the watermark
-		 * and per-cpu initialisations
-		 */
-		memmap_pages = calc_memmap_size(size, freesize);
-		if (!is_highmem_idx(j)) {
-			if (freesize >= memmap_pages) {
-				freesize -= memmap_pages;
-				if (memmap_pages)
-					pr_debug("  %s zone: %lu pages used for memmap\n",
-						 zone_names[j], memmap_pages);
-			} else
-				pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
-					zone_names[j], memmap_pages, freesize);
-		}
-
-		/* Account for reserved pages */
-		if (j == 0 && freesize > dma_reserve) {
-			freesize -= dma_reserve;
-			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
-		}
-
-		if (!is_highmem_idx(j))
-			nr_kernel_pages += freesize;
-		/* Charge for highmem memmap if there are enough kernel pages */
-		else if (nr_kernel_pages > memmap_pages * 2)
-			nr_kernel_pages -= memmap_pages;
-		nr_all_pages += freesize;
+		unsigned long size = zone->spanned_pages;
 
 		/*
-		 * Set an approximate value for lowmem here, it will be adjusted
-		 * when the bootmem allocator frees pages into the buddy system.
-		 * And all highmem pages will be managed by the buddy system.
+		 * Initialize zone->managed_pages as 0, it will be reset
+		 * when memblock allocator frees pages into buddy system.
 		 */
-		zone_init_internals(zone, j, nid, freesize);
+		zone_init_internals(zone, j, nid, zone->present_pages);
 
 		if (!size)
 			continue;
@@ -1874,30 +1836,26 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 				panic("Cannot allocate %zuB for node %d.\n",
 				       sizeof(*pgdat), nid);
 			arch_refresh_nodedata(nid, pgdat);
-			free_area_init_node(nid);
-
-			/*
-			 * We do not want to confuse userspace by sysfs
-			 * files/directories for node without any memory
-			 * attached to it, so this node is not marked as
-			 * N_MEMORY and not marked online so that no sysfs
-			 * hierarchy will be created via register_one_node for
-			 * it. The pgdat will get fully initialized by
-			 * hotadd_init_pgdat() when memory is hotplugged into
-			 * this node.
-			 */
-			continue;
 		}
 
 		pgdat = NODE_DATA(nid);
 		free_area_init_node(nid);
 
-		/* Any memory on that node */
-		if (pgdat->node_present_pages)
+		/*
+		 * No sysfs hierarchy will be created via register_one_node()
+		 * for memory-less node because here it's not marked as N_MEMORY
+		 * and won't be set online later. The benefit is userspace
+		 * program won't be confused by sysfs files/directories of
+		 * memory-less node. The pgdat will get fully initialized by
+		 * hotadd_init_pgdat() when memory is hotplugged into this node.
+		 */
+		if (pgdat->node_present_pages) {
 			node_set_state(nid, N_MEMORY);
-		check_for_memory(pgdat);
+			check_for_memory(pgdat);
+		}
 	}
 
+	calc_nr_kernel_pages();
 	memmap_init();
 
 	/* disable hash distribution for systems with a single node */
@@ -2057,7 +2015,7 @@ static unsigned long  __init deferred_init_pages(struct zone *zone,
 		__init_single_page(page, pfn, zid, nid);
 		nr_pages++;
 	}
-	return (nr_pages);
+	return nr_pages;
 }
 
 /*
@@ -2259,10 +2217,6 @@
  * Return true when zone was grown, otherwise return false. We return true even
  * when we grow less than requested, to let the caller decide if there are
  * enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
  */
 bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 {
@@ -2412,17 +2366,6 @@ void __init page_alloc_init_late(void)
 	page_alloc_sysctl_init();
 }
 
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
-	return 0;
-}
-#endif
-
 /*
  * Adaptive scale is meant to reduce sizes of hash tables on large memory
  * machines. As memory size is increased the scale is also increased but at
@@ -2465,7 +2408,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
 		numentries = nr_kernel_pages;
-		numentries -= arch_reserved_kernel_pages();
 
 		/* It isn't necessary when PAGE_SIZE >= 1MB */
 		if (PAGE_SIZE < SZ_1M)
@@ -2547,26 +2489,9 @@ void *__init alloc_large_system_hash(const char *tablename,
 	return table;
 }
 
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
-	dma_reserve = new_dma_reserve;
-}
-
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
 							unsigned int order)
 {
-
 	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
 		int nid = early_pfn_to_nid(pfn);
 
@@ -2578,6 +2503,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
 		/* KMSAN will take care of these pages. */
 		return;
 	}
+
+	/* pages were reserved and not allocated */
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			set_codetag_empty(ref);
+			put_page_tag_ref(ref);
+		}
+	}
+
 	__free_pages_core(page, order);
 }
 
@@ -2587,6 +2523,9 @@ EXPORT_SYMBOL(init_on_alloc);
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 EXPORT_SYMBOL(init_on_free);
 
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+EXPORT_SYMBOL(init_mlocked_on_free);
+
 static bool _init_on_alloc_enabled_early __read_mostly
 				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
 static int __init early_init_on_alloc(char *buf)
@@ -2604,6 +2543,14 @@ static int __init early_init_on_free(char *buf)
 }
 early_param("init_on_free", early_init_on_free);
 
+static bool _init_mlocked_on_free_enabled_early __read_mostly
+				= IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON);
+static int __init early_init_mlocked_on_free(char *buf)
+{
+	return kstrtobool(buf, &_init_mlocked_on_free_enabled_early);
+}
+early_param("init_mlocked_on_free", early_init_mlocked_on_free);
+
 DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
 
 /*
@@ -2631,12 +2578,21 @@ static void __init mem_debugging_and_hardening_init(void)
 	}
 #endif
 
-	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early ||
+	    _init_mlocked_on_free_enabled_early) &&
 	    page_poisoning_requested) {
 		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
-			"will take precedence over init_on_alloc and init_on_free\n");
+			"will take precedence over init_on_alloc, init_on_free "
+			"and init_mlocked_on_free\n");
 		_init_on_alloc_enabled_early = false;
 		_init_on_free_enabled_early = false;
+		_init_mlocked_on_free_enabled_early = false;
+	}
+
+	if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) {
+		pr_info("mem auto-init: init_on_free is on, "
+			"will take precedence over init_mlocked_on_free\n");
+		_init_mlocked_on_free_enabled_early = false;
 	}
 
 	if (_init_on_alloc_enabled_early) {
@@ -2653,9 +2609,17 @@ static void __init mem_debugging_and_hardening_init(void)
 		static_branch_disable(&init_on_free);
 	}
 
-	if (IS_ENABLED(CONFIG_KMSAN) &&
-	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
-		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+	if (_init_mlocked_on_free_enabled_early) {
+		want_check_pages = true;
+		static_branch_enable(&init_mlocked_on_free);
+	} else {
+		static_branch_disable(&init_mlocked_on_free);
+	}
+
+	if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early ||
+	    _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early))
+		pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and "
+			"init_mlocked_on_free are disabled when running KMSAN\n");
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	if (debug_pagealloc_enabled()) {
@@ -2694,9 +2658,10 @@ static void __init report_meminit(void)
 	else
 		stack = "off";
 
-	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n",
 		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
-		want_init_on_free() ? "on" : "off");
+		want_init_on_free() ? "on" : "off",
+		want_init_mlocked_on_free() ? "on" : "off");
 	if (want_init_on_free())
 		pr_info("mem auto-init: clearing system memory may take some time...\n");
 }
@@ -2793,4 +2758,5 @@ void __init mm_core_init(void)
 	pti_init();
 	kmsan_init_runtime();
 	mm_cache_init();
+	execmem_init();
 }