Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	244
1 file changed, 203 insertions, 41 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f86f288..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,12 +65,6 @@ struct scan_control {
 	/* How many pages shrink_list() should reclaim */
 	unsigned long nr_to_reclaim;
 
-	/* This context's GFP mask */
-	gfp_t gfp_mask;
-
-	/* Allocation order */
-	int order;
-
 	/*
 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
 	 * are scanned.
@@ -83,12 +77,6 @@ struct scan_control {
 	 */
 	struct mem_cgroup *target_mem_cgroup;
 
-	/* Scan (total_size >> priority) pages at once */
-	int priority;
-
-	/* The highest zone to isolate pages for reclaim from */
-	enum zone_type reclaim_idx;
-
 	/* Writepage batching in laptop mode; RECLAIM_WRITE */
 	unsigned int may_writepage:1;
 
@@ -111,6 +99,18 @@ struct scan_control {
 	/* One of the zones is ready for compaction */
 	unsigned int compaction_ready:1;
 
+	/* Allocation order */
+	s8 order;
+
+	/* Scan (total_size >> priority) pages at once */
+	s8 priority;
+
+	/* The highest zone to isolate pages for reclaim from */
+	s8 reclaim_idx;
+
+	/* This context's GFP mask */
+	gfp_t gfp_mask;
+
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
@@ -169,6 +169,70 @@ unsigned long vm_total_pages;
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
+#ifdef CONFIG_MEMCG_KMEM
+
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find corresponding
+ * bit is set in the shrinkers map.
+ *
+ * This value is used by the function to detect registering
+ * shrinkers and to skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+	int id, ret = -ENOMEM;
+
+	down_write(&shrinker_rwsem);
+	/* This may call shrinker, so it must use down_read_trylock() */
+	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+	if (id < 0)
+		goto unlock;
+
+	if (id >= shrinker_nr_max) {
+		if (memcg_expand_shrinker_maps(id)) {
+			idr_remove(&shrinker_idr, id);
+			goto unlock;
+		}
+
+		shrinker_nr_max = id + 1;
+	}
+	shrinker->id = id;
+	ret = 0;
+unlock:
+	up_write(&shrinker_rwsem);
+	return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+	int id = shrinker->id;
+
+	BUG_ON(id < 0);
+
+	down_write(&shrinker_rwsem);
+	idr_remove(&shrinker_idr, id);
+	up_write(&shrinker_rwsem);
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+	return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 #ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
@@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker)
 	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
 	if (!shrinker->nr_deferred)
 		return -ENOMEM;
+
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+		if (prealloc_memcg_shrinker(shrinker))
+			goto free_deferred;
+	}
+
 	return 0;
+
+free_deferred:
+	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
+	return -ENOMEM;
 }
 
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
+	if (!shrinker->nr_deferred)
+		return;
+
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		unregister_memcg_shrinker(shrinker);
+
 	kfree(shrinker->nr_deferred);
 	shrinker->nr_deferred = NULL;
 }
@@ -326,6 +407,10 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG_KMEM
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
 	up_write(&shrinker_rwsem);
 }
 
@@ -347,6 +432,8 @@ void unregister_shrinker(struct shrinker *shrinker)
 {
 	if (!shrinker->nr_deferred)
 		return;
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		unregister_memcg_shrinker(shrinker);
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
@@ -371,9 +458,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 					  : SHRINK_BATCH;
 	long scanned = 0, next_deferred;
 
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
 	freeable = shrinker->count_objects(shrinker, shrinkctl);
-	if (freeable == 0)
-		return 0;
+	if (freeable == 0 || freeable == SHRINK_EMPTY)
+		return freeable;
 
 	/*
 	 * copy the current shrinker scan count into a local variable
@@ -474,6 +564,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	return freed;
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+			struct mem_cgroup *memcg, int priority)
+{
+	struct memcg_shrinker_map *map;
+	unsigned long freed = 0;
+	int ret, i;
+
+	if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+		return 0;
+
+	if (!down_read_trylock(&shrinker_rwsem))
+		return 0;
+
+	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+					true);
+	if (unlikely(!map))
+		goto unlock;
+
+	for_each_set_bit(i, map->map, shrinker_nr_max) {
+		struct shrink_control sc = {
+			.gfp_mask = gfp_mask,
+			.nid = nid,
+			.memcg = memcg,
+		};
+		struct shrinker *shrinker;
+
+		shrinker = idr_find(&shrinker_idr, i);
+		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+			if (!shrinker)
+				clear_bit(i, map->map);
+			continue;
+		}
+
+		ret = do_shrink_slab(&sc, shrinker, priority);
+		if (ret == SHRINK_EMPTY) {
+			clear_bit(i, map->map);
+			/*
+			 * After the shrinker reported that it had no objects to
+			 * free, but before we cleared the corresponding bit in
+			 * the memcg shrinker map, a new object might have been
+			 * added. To make sure, we have the bit set in this
+			 * case, we invoke the shrinker one more time and reset
+			 * the bit if it reports that it is not empty anymore.
+			 * The memory barrier here pairs with the barrier in
+			 * memcg_set_shrinker_bit():
+			 *
+			 * list_lru_add()     shrink_slab_memcg()
+			 *   list_add_tail()    clear_bit()
+			 *   <MB>               <MB>
+			 *   set_bit()          do_shrink_slab()
+			 */
+			smp_mb__after_atomic();
+			ret = do_shrink_slab(&sc, shrinker, priority);
+			if (ret == SHRINK_EMPTY)
+				ret = 0;
+			else
+				memcg_set_shrinker_bit(memcg, nid, i);
+		}
+		freed += ret;
+
+		if (rwsem_is_contended(&shrinker_rwsem)) {
+			freed = freed ? : 1;
+			break;
+		}
+	}
+unlock:
+	up_read(&shrinker_rwsem);
+	return freed;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+			struct mem_cgroup *memcg, int priority)
+{
+	return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 /**
  * shrink_slab - shrink slab caches
  * @gfp_mask: allocation context
@@ -486,10 +654,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
  * unaware shrinkers will receive a node id of 0 instead.
  *
- * @memcg specifies the memory cgroup to target. If it is not NULL,
- * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise, only unaware
- * shrinkers are called.
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
  *
  * @priority is sc->priority, we take the number of objects and >> by priority
  * in order to get the scan target.
@@ -502,9 +668,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 {
 	struct shrinker *shrinker;
 	unsigned long freed = 0;
+	int ret;
 
-	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
-		return 0;
+	if (!mem_cgroup_is_root(memcg))
+		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
 	if (!down_read_trylock(&shrinker_rwsem))
 		goto out;
@@ -516,19 +683,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 			.memcg = memcg,
 		};
 
-		/*
-		 * If kernel memory accounting is disabled, we ignore
-		 * SHRINKER_MEMCG_AWARE flag and call all shrinkers
-		 * passing NULL for memcg.
-		 */
-		if (memcg_kmem_enabled() &&
-		    !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
-			continue;
-
-		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-			sc.nid = 0;
-
-		freed += do_shrink_slab(&sc, shrinker, priority);
+		ret = do_shrink_slab(&sc, shrinker, priority);
+		if (ret == SHRINK_EMPTY)
+			ret = 0;
+		freed += ret;
 
 		/*
 		 * Bail out if someone want to register a new shrinker to
 		 * prevent the regsitration from being stalled for long periods
@@ -554,6 +712,7 @@ void drop_slab_node(int nid)
 		struct mem_cgroup *memcg = NULL;
 
 		freed = 0;
+		memcg = mem_cgroup_iter(NULL, NULL, NULL);
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
@@ -744,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		refcount = 2;
 	if (!page_ref_freeze(page, refcount))
 		goto cannot_free;
-	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
 		page_ref_unfreeze(page, refcount);
 		goto cannot_free;
@@ -2573,9 +2732,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
-			if (memcg)
-				shrink_slab(sc->gfp_mask, pgdat->node_id,
-					    memcg, sc->priority);
+			shrink_slab(sc->gfp_mask, pgdat->node_id,
+				    memcg, sc->priority);
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -2599,10 +2757,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			}
 		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-		if (global_reclaim(sc))
-			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-				    sc->priority);
-
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -3064,6 +3218,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	};
 
 	/*
+	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
+	 * Confirm they are large enough for max values.
+	 */
+	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+	/*
 	 * Do not enter reclaim if fatal signal was delivered while throttled.
 	 * 1 is returned so that the page allocator does not OOM kill at this
 	 * point.
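
For context, below is a minimal sketch (not part of the diff above) of how a memcg-aware shrinker might use the two-stage registration path and the SHRINK_EMPTY convention that shrink_slab_memcg() relies on. The demo_* names are hypothetical, and the sketch assumes the companion list_lru change from the same patch series in which list_lru_init_memcg() takes the shrinker so that list_lru_add() can set the per-memcg shrinker bit.

/* Hypothetical example module; illustrates the API, not part of this commit. */
#include <linux/module.h>
#include <linux/list_lru.h>
#include <linux/shrinker.h>

static struct list_lru demo_lru;	/* assumed memcg-aware object store */

static unsigned long demo_count(struct shrinker *shrink,
				struct shrink_control *sc)
{
	unsigned long count = list_lru_shrink_count(&demo_lru, sc);

	/*
	 * Returning SHRINK_EMPTY (rather than 0) lets shrink_slab_memcg()
	 * clear this shrinker's bit in the memcg shrinker map, so the
	 * shrinker is skipped until list_lru_add() sets the bit again.
	 */
	return count ? count : SHRINK_EMPTY;
}

static enum lru_status demo_isolate(struct list_head *item,
				    struct list_lru_one *lru,
				    spinlock_t *lock, void *arg)
{
	/* A real shrinker would also free the isolated object here. */
	list_lru_isolate(lru, item);
	return LRU_REMOVED;
}

static unsigned long demo_scan(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	return list_lru_shrink_walk(&demo_lru, sc, demo_isolate, NULL);
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
};

static int __init demo_init(void)
{
	int err;

	/* Allocates nr_deferred and an IDR slot (prealloc_memcg_shrinker). */
	err = prealloc_shrinker(&demo_shrinker);
	if (err)
		return err;

	/*
	 * Assumed post-series signature: the shrinker is passed so the LRU
	 * can set the per-memcg bit even before register_shrinker_prepared()
	 * replaces the SHRINKER_REGISTERING placeholder in shrinker_idr.
	 */
	err = list_lru_init_memcg(&demo_lru, &demo_shrinker);
	if (err) {
		free_prealloced_shrinker(&demo_shrinker);
		return err;
	}

	register_shrinker_prepared(&demo_shrinker);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);	/* also releases the IDR slot */
	list_lru_destroy(&demo_lru);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The split between prealloc_shrinker() and register_shrinker_prepared() mirrors the diff: objects may be added to the LRU (setting the memcg map bit) before the shrinker is visible in shrinker_idr, which is exactly the window the SHRINKER_REGISTERING placeholder exists to handle.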