Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	164
1 file changed, 85 insertions, 79 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef44ad736ca1..eee961958021 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int migratetype = 0;
 	int batch_free = 0;
 	int to_free = count;
+	unsigned long nr_scanned;
 
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (to_free) {
 		struct page *page;
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
+	unsigned long nr_scanned;
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	__free_one_page(page, pfn, zone, order, migratetype);
 	if (unlikely(!is_migrate_isolate(migratetype)))
@@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
-	int to_drain;
-	unsigned long batch;
+	int to_drain, batch;
 
 	local_irq_save(flags);
 	batch = ACCESS_ONCE(pcp->batch);
-	if (pcp->count >= batch)
-		to_drain = batch;
-	else
-		to_drain = pcp->count;
+	to_drain = min(pcp->count, batch);
 	if (to_drain > 0) {
 		free_pcppages_bulk(zone, to_drain, pcp);
 		pcp->count -= to_drain;
@@ -1610,6 +1612,9 @@ again:
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+	    !zone_is_fair_depleted(zone))
+		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 	long free_cma = 0;
 
@@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-	if (free_pages - free_cma <= min + lowmem_reserve)
+	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 }
 #endif	/* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
 				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;
 
 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1964,9 +1984,11 @@ zonelist_scan:
 		 */
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
+				break;
+			if (zone_is_fair_depleted(zone)) {
+				nr_fair_skipped++;
 				continue;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2072,13 +2094,7 @@ this_zone_full:
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2087,8 +2103,37 @@ this_zone_full:
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+		return page;
+	}
 
-	return page;
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node.  However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet.  Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }
 
 /*
@@ -2201,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page;
 
-	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+	/* Acquire the per-zone oom lock for each zone */
+	if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
@@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_zonelist_unlock(zonelist, gfp_mask);
 	return page;
 }
 
@@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
 			     struct zonelist *zonelist,
 			     enum zone_type high_zoneidx,
@@ -2616,14 +2639,6 @@ rebalance:
 		goto got_pg;
 
 	/*
-	 * It can become very expensive to allocate transparent hugepages at
-	 * fault, so use asynchronous memory compaction for THP unless it is
-	 * khugepaged trying to collapse.
-	 */
-	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
-		migration_mode = MIGRATE_SYNC_LIGHT;
-
-	/*
 	 * If compaction is deferred for high-order allocations, it is because
 	 * sync compaction recently failed. In this is the case and the caller
 	 * requested a movable allocation that does not heavily disrupt the
@@ -2633,6 +2648,15 @@ rebalance:
 						(gfp_mask & __GFP_NO_KSWAPD))
 		goto nopage;
 
+	/*
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
+						(current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_SYNC_LIGHT;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2766,29 +2790,12 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
 		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node.  However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet.  Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-					    preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
-		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
 		 * complete.
@@ -2962,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
  * Note this is not alloc_pages_exact_node() which allocates on a specific node,
  * but is not exact.
  */
-void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
 	unsigned order = get_order(size);
 	struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2970,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 		return NULL;
 	return make_alloc_exact((unsigned long)page_address(p), order, size);
 }
-EXPORT_SYMBOL(alloc_pages_exact_nid);
 
 /**
  * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3052,7 +3058,7 @@ static inline void show_node(struct zone *zone)
 void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
-	val->sharedram = 0;
+	val->sharedram = global_page_state(NR_SHMEM);
 	val->freeram = global_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
 	val->totalhigh = totalhigh_pages;
@@ -3072,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
 		managed_pages += pgdat->node_zones[zone_type].managed_pages;
 	val->totalram = managed_pages;
+	val->sharedram = node_page_state(nid, NR_SHMEM);
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3253,12 +3260,12 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-			zone->pages_scanned,
+			K(zone_page_state(zone, NR_PAGES_SCANNED)),
 			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->lowmem_reserve[i]);
+			printk(" %ld", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
 
@@ -5579,7 +5586,7 @@ static void calculate_totalreserve_pages(void)
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			unsigned long max = 0;
+			long max = 0;
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5694,9 +5701,8 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				      high_wmark_pages(zone) -
-				      low_wmark_pages(zone) -
-				      zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
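Note on the pages_scanned hunks: zone->pages_scanned is replaced by the NR_PAGES_SCANNED vmstat item, so instead of zeroing a field under zone->lock the code reads the current value and subtracts it back out. A minimal userspace sketch of that read-and-clear pattern, using a C11 atomic long in place of the kernel's per-zone vmstat machinery (zone_counter_read/zone_counter_sub are illustrative helpers, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>

/* Toy stand-in for a per-zone vmstat counter, not the kernel's vm_stat[]. */
static atomic_long zone_pages_scanned;

static long zone_counter_read(void)
{
	return atomic_load(&zone_pages_scanned);
}

static void zone_counter_sub(long delta)
{
	atomic_fetch_sub(&zone_pages_scanned, delta);
}

/*
 * Mirrors the free_pcppages_bulk()/free_one_page() hunks: read the counter
 * once, then subtract that value back out instead of assigning zero.
 */
static void reset_pages_scanned(void)
{
	long nr_scanned = zone_counter_read();

	if (nr_scanned)
		zone_counter_sub(nr_scanned);
}

int main(void)
{
	atomic_fetch_add(&zone_pages_scanned, 128);	/* pretend reclaim scanned pages */
	reset_pages_scanned();
	printf("pages scanned after reset: %ld\n", zone_counter_read());
	return 0;
}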
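drain_zone_pages() now computes the number of pages to drain with min() instead of an open-coded if/else, which also lets batch shrink to a plain int. A tiny standalone equivalent of the before/after logic (MIN here is a local macro, not the kernel's type-checked min()):

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	int count = 37;		/* pages sitting on the per-cpu list */
	int batch = 31;		/* pcp->batch */

	/* Old form: branch on which bound applies. */
	int to_drain_old;
	if (count >= batch)
		to_drain_old = batch;
	else
		to_drain_old = count;

	/* New form: same result, one expression. */
	int to_drain_new = MIN(count, batch);

	printf("old=%d new=%d\n", to_drain_old, to_drain_new);
	return 0;
}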
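The largest change moves the fair-zone retry out of __alloc_pages_nodemask() and into get_page_from_freelist(): a zone whose NR_ALLOC_BATCH reaches zero is flagged ZONE_FAIR_DEPLETED and skipped, the skips are counted in nr_fair_skipped, and only if fair zones were actually skipped are the batches refilled before the zonelist is rescanned without ALLOC_FAIR. The sketch below models that two-pass structure over a toy zone array; the struct, the batch size of 4, and the helpers are illustrative stand-ins, not the kernel's types:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy zone; the fields model NR_ALLOC_BATCH and ZONE_FAIR_DEPLETED only. */
struct toy_zone {
	long alloc_batch;
	long free_pages;
	bool fair_depleted;
	bool local;		/* belongs to the preferred node? */
};

/* Refill the preferred node's batches and clear the flags, like reset_alloc_batches(). */
static void toy_reset_batches(struct toy_zone *zones, size_t nr)
{
	for (size_t i = 0; i < nr; i++) {
		if (!zones[i].local)
			continue;
		zones[i].alloc_batch = 4;	/* high - low watermark stand-in */
		zones[i].fair_depleted = false;
	}
}

static struct toy_zone *toy_alloc(struct toy_zone *zones, size_t nr)
{
	bool fair = true;
	int nr_fair_skipped;
	bool rescan;

scan:
	nr_fair_skipped = 0;
	rescan = false;

	for (size_t i = 0; i < nr; i++) {
		struct toy_zone *z = &zones[i];

		if (fair) {
			if (!z->local)
				break;		/* fairness only rotates local zones */
			if (z->fair_depleted) {
				nr_fair_skipped++;
				continue;
			}
		}
		if (z->free_pages > 0) {
			z->free_pages--;
			if (--z->alloc_batch <= 0)
				z->fair_depleted = true;
			return z;
		}
	}

	/*
	 * First pass failed: drop fairness, refill the batches only if a
	 * depleted zone was actually skipped, and rescan so remote zones
	 * are considered before falling back further.
	 */
	if (fair) {
		fair = false;
		if (nr_fair_skipped)
			toy_reset_batches(zones, nr);
		rescan = true;
	}
	if (rescan)
		goto scan;

	return NULL;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .alloc_batch = 1, .free_pages = 2, .local = true },
		{ .alloc_batch = 4, .free_pages = 8, .local = false },
	};

	for (int i = 0; i < 4; i++) {
		struct toy_zone *z = toy_alloc(zones, 2);

		printf("allocation %d: %s\n", i,
		       !z ? "failed" : (z->local ? "local zone" : "remote zone"));
	}
	return 0;
}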
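The reordered THP hunk also tightens the test for a transparent-hugepage allocation from a single bit (__GFP_NO_KSWAPD) to a full-mask comparison, (gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE, which only matches callers passing the whole composite mask. A small sketch of the difference between the two idioms, using made-up flag values rather than the real GFP bits:

#include <stdbool.h>
#include <stdio.h>

/* Made-up flag values for illustration only; not the kernel's GFP bits. */
#define FLAG_MOVABLE	0x01u
#define FLAG_COMP	0x02u
#define FLAG_NO_KSWAPD	0x04u
#define FLAG_TRANSHUGE	(FLAG_MOVABLE | FLAG_COMP | FLAG_NO_KSWAPD)

static bool is_thp_single_bit(unsigned int mask)
{
	/* Old-style test: any caller setting FLAG_NO_KSWAPD looks like THP. */
	return mask & FLAG_NO_KSWAPD;
}

static bool is_thp_full_mask(unsigned int mask)
{
	/* New-style test: only a caller using the whole composite mask matches. */
	return (mask & FLAG_TRANSHUGE) == FLAG_TRANSHUGE;
}

int main(void)
{
	unsigned int not_thp = FLAG_NO_KSWAPD;	/* sets one bit only */
	unsigned int thp = FLAG_TRANSHUGE;

	printf("single-bit: %d  full-mask: %d\n",
	       is_thp_single_bit(not_thp), is_thp_full_mask(not_thp));
	printf("single-bit: %d  full-mask: %d\n",
	       is_thp_single_bit(thp), is_thp_full_mask(thp));
	return 0;
}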
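Several smaller hunks follow from treating lowmem_reserve[] and the derived quantities as signed: __zone_watermark_ok() reads z->lowmem_reserve[classzone_idx] directly, calculate_totalreserve_pages() uses a signed max, and show_free_areas() prints with %ld instead of %lu. A short illustration of why the signedness and the format specifier matter once the left-hand side of the watermark check can go negative (the values here are made up):

#include <stdio.h>

int main(void)
{
	long free_pages = 10;	/* "free_pages may go negative - that's OK" */
	long free_cma = 25;
	long reserve = 32;	/* signed, like the lowmem_reserve[] entries */
	long min = 16;

	/* Signed arithmetic keeps the comparison meaningful below zero. */
	if (free_pages - free_cma <= min + reserve)
		printf("watermark not met (%ld <= %ld)\n",
		       free_pages - free_cma, min + reserve);

	/*
	 * Printing a negative long with %lu shows a huge bogus value;
	 * %ld reports it faithfully, matching the show_free_areas() change.
	 */
	long delta = free_pages - free_cma;
	printf("as %%ld: %ld, as %%lu: %lu\n", delta, (unsigned long)delta);
	return 0;
}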