Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   582
1 file changed, 457 insertions, 125 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2173f7e5164..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,6 +233,21 @@ enum res_type {  /* Used for OOM nofiier */  #define OOM_CONTROL		(0) +/* + * Iteration constructs for visiting all cgroups (under a tree).  If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root)		\ +	for (iter = mem_cgroup_iter(root, NULL, NULL);	\ +	     iter != NULL;				\ +	     iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter)			\ +	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\ +	     iter != NULL;				\ +	     iter = mem_cgroup_iter(NULL, iter, NULL)) +  /* Some nice accessors for the vmpressure. */  struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)  { @@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;  } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ -	return (memcg == root_mem_cgroup); -} - -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM  /*   * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.   * The main reason for not using cgroup id for this: @@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);  struct workqueue_struct *memcg_kmem_cache_wq; -#endif /* !CONFIG_SLOB */ +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void memcg_free_shrinker_map_rcu(struct rcu_head *head) +{ +	kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, +					 int size, int old_size) +{ +	struct memcg_shrinker_map *new, *old; +	int nid; + +	lockdep_assert_held(&memcg_shrinker_map_mutex); + +	for_each_node(nid) { +		old = rcu_dereference_protected( +			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); +		/* Not yet online memcg */ +		if (!old) +			return 0; + +		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); +		if (!new) +			return -ENOMEM; + +		/* Set all old bits, clear all new bits */ +		memset(new->map, (int)0xff, old_size); +		memset((void *)new->map + old_size, 0, size - old_size); + +		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); +		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); +	} + +	return 0; +} + +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) +{ +	struct mem_cgroup_per_node *pn; +	struct memcg_shrinker_map *map; +	int nid; + +	if (mem_cgroup_is_root(memcg)) +		return; + +	for_each_node(nid) { +		pn = mem_cgroup_nodeinfo(memcg, nid); +		map = rcu_dereference_protected(pn->shrinker_map, true); +		if (map) +			kvfree(map); +		rcu_assign_pointer(pn->shrinker_map, NULL); +	} +} + +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ +	struct memcg_shrinker_map *map; +	int nid, size, ret = 0; + +	if (mem_cgroup_is_root(memcg)) +		return 0; + +	mutex_lock(&memcg_shrinker_map_mutex); +	size = memcg_shrinker_map_size; +	for_each_node(nid) { +		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); +		if (!map) { +			memcg_free_shrinker_maps(memcg); +			ret = -ENOMEM; +			break; +		} +		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); +	} +	mutex_unlock(&memcg_shrinker_map_mutex); + +	return ret; +} + +int memcg_expand_shrinker_maps(int new_id) +{ +	int size, old_size, ret = 0; +	struct mem_cgroup *memcg; + +	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * 
sizeof(unsigned long); +	old_size = memcg_shrinker_map_size; +	if (size <= old_size) +		return 0; + +	mutex_lock(&memcg_shrinker_map_mutex); +	if (!root_mem_cgroup) +		goto unlock; + +	for_each_mem_cgroup(memcg) { +		if (mem_cgroup_is_root(memcg)) +			continue; +		ret = memcg_expand_one_shrinker_map(memcg, size, old_size); +		if (ret) +			goto unlock; +	} +unlock: +	if (!ret) +		memcg_shrinker_map_size = size; +	mutex_unlock(&memcg_shrinker_map_mutex); +	return ret; +} + +void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ +	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { +		struct memcg_shrinker_map *map; + +		rcu_read_lock(); +		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); +		/* Pairs with smp mb in shrink_slab() */ +		smp_mb__before_atomic(); +		set_bit(shrinker_id, map->map); +		rcu_read_unlock(); +	} +} + +#else /* CONFIG_MEMCG_KMEM */ +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ +	return 0; +} +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } +#endif /* CONFIG_MEMCG_KMEM */  /**   * mem_cgroup_css_from_page - css of the memcg associated with a page @@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)  }  EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)  { -	struct mem_cgroup *memcg = NULL; +	struct mem_cgroup *memcg; + +	if (mem_cgroup_disabled()) +		return NULL;  	rcu_read_lock();  	do { @@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)  	rcu_read_unlock();  	return memcg;  } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. + * @page: page from which memcg should be extracted. + * + * Obtain a reference on page->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ +	struct mem_cgroup *memcg = page->mem_cgroup; + +	if (mem_cgroup_disabled()) +		return NULL; + +	rcu_read_lock(); +	if (!memcg || !css_tryget_online(&memcg->css)) +		memcg = root_mem_cgroup; +	rcu_read_unlock(); +	return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_page); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ +	if (unlikely(current->active_memcg)) { +		struct mem_cgroup *memcg = root_mem_cgroup; + +		rcu_read_lock(); +		if (css_tryget_online(¤t->active_memcg->css)) +			memcg = current->active_memcg; +		rcu_read_unlock(); +		return memcg; +	} +	return get_mem_cgroup_from_mm(current->mm); +}  /**   * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)  	}  } -/* - * Iteration constructs for visiting all cgroups (under a tree).  If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. 
- */ -#define for_each_mem_cgroup_tree(iter, root)		\ -	for (iter = mem_cgroup_iter(root, NULL, NULL);	\ -	     iter != NULL;				\ -	     iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter)			\ -	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\ -	     iter != NULL;				\ -	     iter = mem_cgroup_iter(NULL, iter, NULL)) -  /**   * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy   * @memcg: hierarchy root @@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)  		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);  } -static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +enum oom_status { +	OOM_SUCCESS, +	OOM_FAILED, +	OOM_ASYNC, +	OOM_SKIPPED +}; + +static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)  { -	if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) -		return; +	if (order > PAGE_ALLOC_COSTLY_ORDER) +		return OOM_SKIPPED; +  	/*  	 * We are in the middle of the charge context here, so we  	 * don't want to block when potentially sitting on a callstack  	 * that holds all kinds of filesystem and mm locks.  	 * -	 * Also, the caller may handle a failed allocation gracefully -	 * (like optional page cache readahead) and so an OOM killer -	 * invocation might not even be necessary. +	 * cgroup1 allows disabling the OOM killer and waiting for outside +	 * handling until the charge can succeed; remember the context and put +	 * the task to sleep at the end of the page fault when all locks are +	 * released.  	 * -	 * That's why we don't do anything here except remember the -	 * OOM context and then deal with it at the end of the page -	 * fault when the stack is unwound, the locks are released, -	 * and when we know whether the fault was overall successful. +	 * On the other hand, in-kernel OOM killer allows for an async victim +	 * memory reclaim (oom_reaper) and that means that we are not solely +	 * relying on the oom victim to make a forward progress and we can +	 * invoke the oom killer here. +	 * +	 * Please note that mem_cgroup_out_of_memory might fail to find a +	 * victim and then we have to bail out from the charge path.  	 */ -	css_get(&memcg->css); -	current->memcg_in_oom = memcg; -	current->memcg_oom_gfp_mask = mask; -	current->memcg_oom_order = order; +	if (memcg->oom_kill_disable) { +		if (!current->in_user_fault) +			return OOM_SKIPPED; +		css_get(&memcg->css); +		current->memcg_in_oom = memcg; +		current->memcg_oom_gfp_mask = mask; +		current->memcg_oom_order = order; + +		return OOM_ASYNC; +	} + +	if (mem_cgroup_out_of_memory(memcg, mask, order)) +		return OOM_SUCCESS; + +	WARN(1,"Memory cgroup charge failed because of no reclaimable memory! " +		"This looks like a misconfiguration or a kernel bug."); +	return OOM_FAILED;  }  /** @@ -1578,6 +1777,62 @@ cleanup:  }  /** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
+ */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, +					    struct mem_cgroup *oom_domain) +{ +	struct mem_cgroup *oom_group = NULL; +	struct mem_cgroup *memcg; + +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) +		return NULL; + +	if (!oom_domain) +		oom_domain = root_mem_cgroup; + +	rcu_read_lock(); + +	memcg = mem_cgroup_from_task(victim); +	if (memcg == root_mem_cgroup) +		goto out; + +	/* +	 * Traverse the memory cgroup hierarchy from the victim task's +	 * cgroup up to the OOMing cgroup (or root) to find the +	 * highest-level memory cgroup with oom.group set. +	 */ +	for (; memcg; memcg = parent_mem_cgroup(memcg)) { +		if (memcg->oom_group) +			oom_group = memcg; + +		if (memcg == oom_domain) +			break; +	} + +	if (oom_group) +		css_get(&oom_group->css); +out: +	rcu_read_unlock(); + +	return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ +	pr_info("Tasks in "); +	pr_cont_cgroup_path(memcg->css.cgroup); +	pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + +/**   * lock_page_memcg - lock a page->mem_cgroup binding   * @page: the page   * @@ -1899,6 +2154,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	unsigned long nr_reclaimed;  	bool may_swap = true;  	bool drained = false; +	bool oomed = false; +	enum oom_status oom_status;  	if (mem_cgroup_is_root(memcg))  		return 0; @@ -1986,6 +2243,9 @@ retry:  	if (nr_retries--)  		goto retry; +	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) +		goto nomem; +  	if (gfp_mask & __GFP_NOFAIL)  		goto force; @@ -1994,8 +2254,23 @@ retry:  	memcg_memory_event(mem_over_limit, MEMCG_OOM); -	mem_cgroup_oom(mem_over_limit, gfp_mask, +	/* +	 * keep retrying as long as the memcg oom killer is able to make +	 * a forward progress or bypass the charge if the oom killer +	 * couldn't make any progress. 
+	 */ +	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,  		       get_order(nr_pages * PAGE_SIZE)); +	switch (oom_status) { +	case OOM_SUCCESS: +		nr_retries = MEM_CGROUP_RECLAIM_RETRIES; +		oomed = true; +		goto retry; +	case OOM_FAILED: +		goto force; +	default: +		goto nomem; +	}  nomem:  	if (!(gfp_mask & __GFP_NOFAIL))  		return -ENOMEM; @@ -2119,7 +2394,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,  		unlock_page_lru(page, isolated);  } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM  static int memcg_alloc_cache_id(void)  {  	int id, size; @@ -2261,7 +2536,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)  	if (current->memcg_kmem_skip_account)  		return cachep; -	memcg = get_mem_cgroup_from_mm(current->mm); +	memcg = get_mem_cgroup_from_current();  	kmemcg_id = READ_ONCE(memcg->kmemcg_id);  	if (kmemcg_id < 0)  		goto out; @@ -2345,7 +2620,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)  	if (memcg_kmem_bypass())  		return 0; -	memcg = get_mem_cgroup_from_mm(current->mm); +	memcg = get_mem_cgroup_from_current();  	if (!mem_cgroup_is_root(memcg)) {  		ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);  		if (!ret) @@ -2384,7 +2659,7 @@ void memcg_kmem_uncharge(struct page *page, int order)  	css_put_many(&memcg->css, nr_pages);  } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */  #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2680,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,  	return retval;  } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ -	struct mem_cgroup *iter; -	int i; - -	memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - -	for_each_mem_cgroup_tree(iter, memcg) { -		for (i = 0; i < MEMCG_NR_STAT; i++) -			stat[i] += memcg_page_state(iter, i); -	} -} +struct accumulated_stats { +	unsigned long stat[MEMCG_NR_STAT]; +	unsigned long events[NR_VM_EVENT_ITEMS]; +	unsigned long lru_pages[NR_LRU_LISTS]; +	const unsigned int *stats_array; +	const unsigned int *events_array; +	int stats_size; +	int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, +				  struct accumulated_stats *acc)  { -	struct mem_cgroup *iter; +	struct mem_cgroup *mi;  	int i; -	memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); +	for_each_mem_cgroup_tree(mi, memcg) { +		for (i = 0; i < acc->stats_size; i++) +			acc->stat[i] += memcg_page_state(mi, +				acc->stats_array ? acc->stats_array[i] : i); -	for_each_mem_cgroup_tree(iter, memcg) { -		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) -			events[i] += memcg_sum_events(iter, i); +		for (i = 0; i < acc->events_size; i++) +			acc->events[i] += memcg_sum_events(mi, +				acc->events_array ? 
acc->events_array[i] : i); + +		for (i = 0; i < NR_LRU_LISTS; i++) +			acc->lru_pages[i] += +				mem_cgroup_nr_lru_pages(mi, BIT(i));  	}  } @@ -2779,7 +3059,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,  	}  } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM  static int memcg_online_kmem(struct mem_cgroup *memcg)  {  	int memcg_id; @@ -2851,7 +3131,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)  	}  	rcu_read_unlock(); -	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); +	memcg_drain_all_list_lrus(kmemcg_id, parent);  	memcg_free_cache_id(kmemcg_id);  } @@ -2879,7 +3159,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)  static void memcg_free_kmem(struct mem_cgroup *memcg)  {  } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */  static int memcg_update_kmem_max(struct mem_cgroup *memcg,  				 unsigned long max) @@ -3113,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)  	unsigned long memory, memsw;  	struct mem_cgroup *mi;  	unsigned int i; +	struct accumulated_stats acc;  	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3145,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)  		seq_printf(m, "hierarchical_memsw_limit %llu\n",  			   (u64)memsw * PAGE_SIZE); -	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { -		unsigned long long val = 0; +	memset(&acc, 0, sizeof(acc)); +	acc.stats_size = ARRAY_SIZE(memcg1_stats); +	acc.stats_array = memcg1_stats; +	acc.events_size = ARRAY_SIZE(memcg1_events); +	acc.events_array = memcg1_events; +	accumulate_memcg_tree(memcg, &acc); +	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {  		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())  			continue; -		for_each_mem_cgroup_tree(mi, memcg) -			val += memcg_page_state(mi, memcg1_stats[i]) * -			PAGE_SIZE; -		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); -	} - -	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { -		unsigned long long val = 0; - -		for_each_mem_cgroup_tree(mi, memcg) -			val += memcg_sum_events(mi, memcg1_events[i]); -		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); +		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], +			   (u64)acc.stat[i] * PAGE_SIZE);  	} -	for (i = 0; i < NR_LRU_LISTS; i++) { -		unsigned long long val = 0; +	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) +		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], +			   (u64)acc.events[i]); -		for_each_mem_cgroup_tree(mi, memcg) -			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; -		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); -	} +	for (i = 0; i < NR_LRU_LISTS; i++) +		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], +			   (u64)acc.lru_pages[i] * PAGE_SIZE);  #ifdef CONFIG_DEBUG_VM  	{ @@ -4183,7 +4459,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)  	INIT_LIST_HEAD(&memcg->event_list);  	spin_lock_init(&memcg->event_list_lock);  	memcg->socket_pressure = jiffies; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM  	memcg->kmemcg_id = -1;  #endif  #ifdef CONFIG_CGROUP_WRITEBACK @@ -4260,6 +4536,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	/* +	 * A memcg must be visible for memcg_expand_shrinker_maps() +	 * by the time the maps are allocated. So, we allocate maps +	 * here, when for_each_mem_cgroup() can't skip it. 
+	 */ +	if (memcg_alloc_shrinker_maps(memcg)) { +		mem_cgroup_id_remove(memcg); +		return -ENOMEM; +	} +  	/* Online state pins memcg ID, memcg ID pins CSS */  	atomic_set(&memcg->id.ref, 1);  	css_get(css); @@ -4312,6 +4598,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)  	vmpressure_cleanup(&memcg->vmpressure);  	cancel_work_sync(&memcg->high_work);  	mem_cgroup_remove_from_trees(memcg); +	memcg_free_shrinker_maps(memcg);  	memcg_free_kmem(memcg);  	mem_cgroup_free(memcg);  } @@ -5256,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v)  static int memory_stat_show(struct seq_file *m, void *v)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); -	unsigned long stat[MEMCG_NR_STAT]; -	unsigned long events[NR_VM_EVENT_ITEMS]; +	struct accumulated_stats acc;  	int i;  	/* @@ -5271,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v)  	 * Current memory state:  	 */ -	tree_stat(memcg, stat); -	tree_events(memcg, events); +	memset(&acc, 0, sizeof(acc)); +	acc.stats_size = MEMCG_NR_STAT; +	acc.events_size = NR_VM_EVENT_ITEMS; +	accumulate_memcg_tree(memcg, &acc);  	seq_printf(m, "anon %llu\n", -		   (u64)stat[MEMCG_RSS] * PAGE_SIZE); +		   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);  	seq_printf(m, "file %llu\n", -		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE); +		   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);  	seq_printf(m, "kernel_stack %llu\n", -		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); +		   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);  	seq_printf(m, "slab %llu\n", -		   (u64)(stat[NR_SLAB_RECLAIMABLE] + -			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); +		   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + +			 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);  	seq_printf(m, "sock %llu\n", -		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE); +		   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);  	seq_printf(m, "shmem %llu\n", -		   (u64)stat[NR_SHMEM] * PAGE_SIZE); +		   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);  	seq_printf(m, "file_mapped %llu\n", -		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); +		   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);  	seq_printf(m, "file_dirty %llu\n", -		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); +		   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);  	seq_printf(m, "file_writeback %llu\n", -		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE); - -	for (i = 0; i < NR_LRU_LISTS; i++) { -		struct mem_cgroup *mi; -		unsigned long val = 0; +		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); -		for_each_mem_cgroup_tree(mi, memcg) -			val += mem_cgroup_nr_lru_pages(mi, BIT(i)); -		seq_printf(m, "%s %llu\n", -			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); -	} +	for (i = 0; i < NR_LRU_LISTS; i++) +		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], +			   (u64)acc.lru_pages[i] * PAGE_SIZE);  	seq_printf(m, "slab_reclaimable %llu\n", -		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); +		   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);  	seq_printf(m, "slab_unreclaimable %llu\n", -		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); +		   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);  	/* Accumulated memory events */ -	seq_printf(m, "pgfault %lu\n", events[PGFAULT]); -	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); +	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); +	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); -	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); -	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + -		   events[PGSCAN_DIRECT]); -	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + -		   
events[PGSTEAL_DIRECT]); -	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); -	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); -	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); -	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); +	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); +	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + +		   acc.events[PGSCAN_DIRECT]); +	seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + +		   acc.events[PGSTEAL_DIRECT]); +	seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); +	seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); +	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); +	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);  	seq_printf(m, "workingset_refault %lu\n", -		   stat[WORKINGSET_REFAULT]); +		   acc.stat[WORKINGSET_REFAULT]);  	seq_printf(m, "workingset_activate %lu\n", -		   stat[WORKINGSET_ACTIVATE]); +		   acc.stat[WORKINGSET_ACTIVATE]);  	seq_printf(m, "workingset_nodereclaim %lu\n", -		   stat[WORKINGSET_NODERECLAIM]); +		   acc.stat[WORKINGSET_NODERECLAIM]);  	return 0;  } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + +	seq_printf(m, "%d\n", memcg->oom_group); + +	return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, +				      char *buf, size_t nbytes, loff_t off) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); +	int ret, oom_group; + +	buf = strstrip(buf); +	if (!buf) +		return -EINVAL; + +	ret = kstrtoint(buf, 0, &oom_group); +	if (ret) +		return ret; + +	if (oom_group != 0 && oom_group != 1) +		return -EINVAL; + +	memcg->oom_group = oom_group; + +	return nbytes; +} +  static struct cftype memory_files[] = {  	{  		.name = "current", @@ -5376,6 +5689,12 @@ static struct cftype memory_files[] = {  		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = memory_stat_show,  	}, +	{ +		.name = "oom.group", +		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, +		.seq_show = memory_oom_group_show, +		.write = memory_oom_group_write, +	},  	{ }	/* terminate */  }; @@ -5600,6 +5919,19 @@ out:  	return ret;  } +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, +			  gfp_t gfp_mask, struct mem_cgroup **memcgp, +			  bool compound) +{ +	struct mem_cgroup *memcg; +	int ret; + +	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); +	memcg = *memcgp; +	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); +	return ret; +} +  /**   * mem_cgroup_commit_charge - commit a page charge   * @page: page to charge @@ -6010,7 +6342,7 @@ static int __init mem_cgroup_init(void)  {  	int cpu, node; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM  	/*  	 * Kmem cache creation is mostly done with the slab_mutex held,  	 * so use a workqueue with limited concurrency to avoid stalling  |
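
The per-memcg shrinker bitmap added above only pays off if memcg-aware caches flag themselves when they gain their first object for a given (memcg, node) pair, so that shrink_slab() visits only shrinkers that have work to do. The following sketch shows that producer side; struct my_cache, its shrinker_id field and my_cache_add() are hypothetical stand-ins, and only memcg_set_shrinker_bit() comes from the hunks above (its prototype is assumed to be exported by the companion header change).

/*
 * Illustrative only: a memcg-aware cache keeping one LRU list per
 * (memcg, node) pair.  The types and the helper are invented for this
 * sketch; memcg_set_shrinker_bit() is the function added in this patch.
 */
#include <linux/list.h>
#include <linux/memcontrol.h>

struct my_cache {
	int shrinker_id;			/* id handed out when the shrinker was registered */
	struct list_head *per_memcg_lru;	/* one list per node for a given memcg (simplified) */
};

static void my_cache_add(struct my_cache *cache, struct list_head *obj,
			 struct mem_cgroup *memcg, int nid)
{
	bool was_empty = list_empty(&cache->per_memcg_lru[nid]);

	list_add_tail(obj, &cache->per_memcg_lru[nid]);
	/*
	 * Flag this (shrinker, memcg, node) combination as populated so that
	 * reclaim will call the shrinker for this memcg; the barrier inside
	 * memcg_set_shrinker_bit() pairs with the one in shrink_slab(), as
	 * its comment notes.
	 */
	if (was_empty)
		memcg_set_shrinker_bit(memcg, nid, cache->shrinker_id);
}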
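
mem_cgroup_get_oom_group() above hands the OOM killer a referenced memcg whenever the victim sits below a cgroup with oom.group enabled; the caller is then expected to kill every task in that group and drop the reference with mem_cgroup_put(), as the kernel-doc states. A hedged sketch of such a caller follows: kill_task() and oom_kill_member() are illustrative placeholders, while mem_cgroup_scan_tasks(), mem_cgroup_print_oom_group() and mem_cgroup_put() are existing helpers (the first two appear in this diff).

/* Illustrative OOM-killer side of the oom.group feature. */
#include <linux/memcontrol.h>
#include <linux/sched.h>

static void kill_task(struct task_struct *task);	/* hypothetical: kills one task */

static int oom_kill_member(struct task_struct *task, void *arg)
{
	kill_task(task);
	return 0;		/* keep iterating over the group's tasks */
}

static void oom_kill_victim(struct task_struct *victim, struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group;

	/* NULL is returned on cgroup1 or when no ancestor set oom.group. */
	oom_group = mem_cgroup_get_oom_group(victim, oom_domain);

	kill_task(victim);	/* the selected victim is killed in any case */

	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_member, NULL);
		mem_cgroup_put(oom_group);	/* drop the css reference taken above */
	}
}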
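
From userspace the new knob is exposed as memory.oom.group in every non-root cgroup of the unified hierarchy, and memory_oom_group_write() above accepts only 0 or 1. A minimal sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and a delegated child group named "job" already exists:

/* Turn on group-wide OOM killing for one cgroup; the path is an assumption. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/job/memory.oom.group";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open memory.oom.group");
		return 1;
	}
	/* Writing "0" switches back to killing single tasks. */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}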