Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 961 | 
1 file changed, 613 insertions, 348 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf5e89457149..19d5d4274e22 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,16 +45,17 @@  #include <linux/swapops.h>  #include <linux/spinlock.h>  #include <linux/eventfd.h> +#include <linux/poll.h>  #include <linux/sort.h>  #include <linux/fs.h>  #include <linux/seq_file.h> -#include <linux/vmalloc.h>  #include <linux/vmpressure.h>  #include <linux/mm_inline.h>  #include <linux/page_cgroup.h>  #include <linux/cpu.h>  #include <linux/oom.h>  #include <linux/lockdep.h> +#include <linux/file.h>  #include "internal.h"  #include <net/sock.h>  #include <net/ip.h> @@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {  	 * matches memcg->dead_count of the hierarchy root group.  	 */  	struct mem_cgroup *last_visited; -	unsigned long last_dead_count; +	int last_dead_count;  	/* scan generation, increased every round-trip */  	unsigned int generation; @@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {  	struct eventfd_ctx *eventfd;  }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct mem_cgroup_event { +	/* +	 * memcg which the event belongs to. +	 */ +	struct mem_cgroup *memcg; +	/* +	 * eventfd to signal userspace about the event. +	 */ +	struct eventfd_ctx *eventfd; +	/* +	 * Each of these stored in a list by the cgroup. +	 */ +	struct list_head list; +	/* +	 * register_event() callback will be used to add new userspace +	 * waiter for changes related to this event.  Use eventfd_signal() +	 * on eventfd to send notification to userspace. +	 */ +	int (*register_event)(struct mem_cgroup *memcg, +			      struct eventfd_ctx *eventfd, const char *args); +	/* +	 * unregister_event() callback will be called when userspace closes +	 * the eventfd or on cgroup removing.  This callback must be set, +	 * if you want provide notification functionality. +	 */ +	void (*unregister_event)(struct mem_cgroup *memcg, +				 struct eventfd_ctx *eventfd); +	/* +	 * All fields below needed to unregister event when +	 * userspace closes eventfd. +	 */ +	poll_table pt; +	wait_queue_head_t *wqh; +	wait_queue_t wait; +	struct work_struct remove; +}; +  static void mem_cgroup_threshold(struct mem_cgroup *memcg);  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -331,27 +372,20 @@ struct mem_cgroup {  	atomic_t	numainfo_updating;  #endif +	/* List of events which userspace want to receive */ +	struct list_head event_list; +	spinlock_t event_list_lock; +  	struct mem_cgroup_per_node *nodeinfo[0];  	/* WARNING: nodeinfo must be the last member here */  }; -static size_t memcg_size(void) -{ -	return sizeof(struct mem_cgroup) + -		nr_node_ids * sizeof(struct mem_cgroup_per_node); -} -  /* internal only representation about the status of kmem accounting. */  enum { -	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ -	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. 
*/ +	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */  	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */  }; -/* We account when limit is on, but only after call sites are patched */ -#define KMEM_ACCOUNTED_MASK \ -		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) -  #ifdef CONFIG_MEMCG_KMEM  static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)  { @@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)  	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);  } -static void memcg_kmem_set_activated(struct mem_cgroup *memcg) -{ -	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} - -static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) -{ -	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} -  static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)  {  	/* @@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;  } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) -{ -	return &mem_cgroup_from_css(css)->vmpressure; -} -  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)  {  	return (memcg == root_mem_cgroup); @@ -1098,16 +1117,22 @@ skip_node:  	 * skipped and we should continue the tree walk.  	 * last_visited css is safe to use because it is  	 * protected by css_get and the tree walk is rcu safe. +	 * +	 * We do not take a reference on the root of the tree walk +	 * because we might race with the root removal when it would +	 * be the only node in the iterated hierarchy and mem_cgroup_iter +	 * would end up in an endless loop because it expects that at +	 * least one valid node will be returned. Root cannot disappear +	 * because caller of the iterator should hold it already so +	 * skipping css reference should be safe.  	 */  	if (next_css) { -		struct mem_cgroup *mem = mem_cgroup_from_css(next_css); +		if ((next_css->flags & CSS_ONLINE) && +				(next_css == &root->css || css_tryget(next_css))) +			return mem_cgroup_from_css(next_css); -		if (css_tryget(&mem->css)) -			return mem; -		else { -			prev_css = next_css; -			goto skip_node; -		} +		prev_css = next_css; +		goto skip_node;  	}  	return NULL; @@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,  	if (iter->last_dead_count == *sequence) {  		smp_rmb();  		position = iter->last_visited; -		if (position && !css_tryget(&position->css)) + +		/* +		 * We cannot take a reference to root because we might race +		 * with root removal and returning NULL would end up in +		 * an endless loop on the iterator user level when root +		 * would be returned all the time. 
+		 */ +		if (position && position != root && +				!css_tryget(&position->css))  			position = NULL;  	}  	return position; @@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,  static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,  				   struct mem_cgroup *last_visited,  				   struct mem_cgroup *new_position, +				   struct mem_cgroup *root,  				   int sequence)  { -	if (last_visited) +	/* root reference counting symmetric to mem_cgroup_iter_load */ +	if (last_visited && last_visited != root)  		css_put(&last_visited->css);  	/*  	 * We store the sequence count from the time @last_visited was @@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  		memcg = __mem_cgroup_iter_next(root, last_visited);  		if (reclaim) { -			mem_cgroup_iter_update(iter, last_visited, memcg, seq); +			mem_cgroup_iter_update(iter, last_visited, memcg, root, +					seq);  			if (!memcg)  				iter->generation++; @@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,   */  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)  { -	struct cgroup *task_cgrp; -	struct cgroup *mem_cgrp;  	/* -	 * Need a buffer in BSS, can't rely on allocations. The code relies -	 * on the assumption that OOM is serialized for memory controller. -	 * If this assumption is broken, revisit this code. +	 * protects memcg_name and makes sure that parallel ooms do not +	 * interleave  	 */ +	static DEFINE_SPINLOCK(oom_info_lock); +	struct cgroup *task_cgrp; +	struct cgroup *mem_cgrp;  	static char memcg_name[PATH_MAX];  	int ret;  	struct mem_cgroup *iter; @@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)  	if (!p)  		return; +	spin_lock(&oom_info_lock);  	rcu_read_lock();  	mem_cgrp = memcg->css.cgroup; @@ -1730,6 +1767,7 @@ done:  		pr_cont("\n");  	} +	spin_unlock(&oom_info_lock);  }  /* @@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,  				break;  			};  			points = oom_badness(task, memcg, NULL, totalpages); -			if (points > chosen_points) { -				if (chosen) -					put_task_struct(chosen); -				chosen = task; -				chosen_points = points; -				get_task_struct(chosen); -			} +			if (!points || points < chosen_points) +				continue; +			/* Prefer thread group leaders for display purposes */ +			if (points == chosen_points && +			    thread_group_leader(chosen)) +				continue; + +			if (chosen) +				put_task_struct(chosen); +			chosen = task; +			chosen_points = points; +			get_task_struct(chosen);  		}  		css_task_iter_end(&it);  	} @@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  	unsigned short id;  	swp_entry_t ent; -	VM_BUG_ON(!PageLocked(page)); +	VM_BUG_ON_PAGE(!PageLocked(page), page);  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc); @@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	bool anon;  	lock_page_cgroup(pc); -	VM_BUG_ON(PageCgroupUsed(pc)); +	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);  	/*  	 * we don't need page_cgroup_lock about tail pages, becase they are not  	 * accessed by any other context at this point. 
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	if (lrucare) {  		if (was_on_lru) {  			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); -			VM_BUG_ON(PageLRU(page)); +			VM_BUG_ON_PAGE(PageLRU(page), page);  			SetPageLRU(page);  			add_page_to_lru_list(page, lruvec, page_lru(page));  		} @@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  static DEFINE_MUTEX(set_limit_mutex);  #ifdef CONFIG_MEMCG_KMEM +static DEFINE_MUTEX(activate_kmem_mutex); +  static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)  {  	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && -		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +		memcg_kmem_is_active(memcg);  }  /* @@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)  }  #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, -				    struct cftype *cft, struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));  	struct memcg_cache_params *params;  	if (!memcg_can_account_kmem(memcg)) @@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)  		css_put(&memcg->css);  } -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) -{ -	if (!memcg) -		return; - -	mutex_lock(&memcg->slab_caches_mutex); -	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); -	mutex_unlock(&memcg->slab_caches_mutex); -} -  /*   * helper for acessing a memcg's index. It will be used as an index in the   * child cache array in kmem_cache, and also to derive its name. This function @@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)  	return memcg ? memcg->kmemcg_id : -1;  } -/* - * This ends up being protected by the set_limit mutex, during normal - * operation, because that is its main call site. - * - * But when we create a new cache, we can call this as well if its parent - * is kmem-limited. That will have to hold set_limit_mutex as well. - */ -int memcg_update_cache_sizes(struct mem_cgroup *memcg) -{ -	int num, ret; - -	num = ida_simple_get(&kmem_limited_groups, -				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); -	if (num < 0) -		return num; -	/* -	 * After this point, kmem_accounted (that we test atomically in -	 * the beginning of this conditional), is no longer 0. This -	 * guarantees only one process will set the following boolean -	 * to true. We don't need test_and_set because we're protected -	 * by the set_limit_mutex anyway. 
-	 */ -	memcg_kmem_set_activated(memcg); - -	ret = memcg_update_all_caches(num+1); -	if (ret) { -		ida_simple_remove(&kmem_limited_groups, num); -		memcg_kmem_clear_activated(memcg); -		return ret; -	} - -	memcg->kmemcg_id = num; -	INIT_LIST_HEAD(&memcg->memcg_slab_caches); -	mutex_init(&memcg->slab_caches_mutex); -	return 0; -} -  static size_t memcg_caches_array_size(int num_groups)  {  	ssize_t size; @@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  	if (num_groups > memcg_limited_groups_array_size) {  		int i; +		struct memcg_cache_params *new_params;  		ssize_t size = memcg_caches_array_size(num_groups);  		size *= sizeof(void *);  		size += offsetof(struct memcg_cache_params, memcg_caches); -		s->memcg_params = kzalloc(size, GFP_KERNEL); -		if (!s->memcg_params) { -			s->memcg_params = cur_params; +		new_params = kzalloc(size, GFP_KERNEL); +		if (!new_params)  			return -ENOMEM; -		} -		s->memcg_params->is_root_cache = true; +		new_params->is_root_cache = true;  		/*  		 * There is the chance it will be bigger than @@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  		for (i = 0; i < memcg_limited_groups_array_size; i++) {  			if (!cur_params->memcg_caches[i])  				continue; -			s->memcg_params->memcg_caches[i] = +			new_params->memcg_caches[i] =  						cur_params->memcg_caches[i];  		} @@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  		 * bigger than the others. And all updates will reset this  		 * anyway.  		 */ -		kfree(cur_params); +		rcu_assign_pointer(s->memcg_params, new_params); +		if (cur_params) +			kfree_rcu(cur_params, rcu_head);  	}  	return 0;  } -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, -			 struct kmem_cache *root_cache) +int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, +			     struct kmem_cache *root_cache)  {  	size_t size; @@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,  	return 0;  } -void memcg_release_cache(struct kmem_cache *s) +void memcg_free_cache_params(struct kmem_cache *s) +{ +	kfree(s->memcg_params); +} + +void memcg_register_cache(struct kmem_cache *s)  {  	struct kmem_cache *root;  	struct mem_cgroup *memcg;  	int id; -	/* -	 * This happens, for instance, when a root cache goes away before we -	 * add any memcg. -	 */ -	if (!s->memcg_params) +	if (is_root_cache(s))  		return; -	if (s->memcg_params->is_root_cache) -		goto out; +	/* +	 * Holding the slab_mutex assures nobody will touch the memcg_caches +	 * array while we are modifying it. +	 */ +	lockdep_assert_held(&slab_mutex); +	root = s->memcg_params->root_cache;  	memcg = s->memcg_params->memcg; -	id  = memcg_cache_id(memcg); +	id = memcg_cache_id(memcg); + +	css_get(&memcg->css); + + +	/* +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a +	 * barrier here to ensure nobody will see the kmem_cache partially +	 * initialized. +	 */ +	smp_wmb(); + +	/* +	 * Initialize the pointer to this cache in its parent's memcg_params +	 * before adding it to the memcg_slab_caches list, otherwise we can +	 * fail to convert memcg_params_to_cache() while traversing the list. 
+	 */ +	VM_BUG_ON(root->memcg_params->memcg_caches[id]); +	root->memcg_params->memcg_caches[id] = s; + +	mutex_lock(&memcg->slab_caches_mutex); +	list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); +	mutex_unlock(&memcg->slab_caches_mutex); +} + +void memcg_unregister_cache(struct kmem_cache *s) +{ +	struct kmem_cache *root; +	struct mem_cgroup *memcg; +	int id; + +	if (is_root_cache(s)) +		return; + +	/* +	 * Holding the slab_mutex assures nobody will touch the memcg_caches +	 * array while we are modifying it. +	 */ +	lockdep_assert_held(&slab_mutex);  	root = s->memcg_params->root_cache; -	root->memcg_params->memcg_caches[id] = NULL; +	memcg = s->memcg_params->memcg; +	id = memcg_cache_id(memcg);  	mutex_lock(&memcg->slab_caches_mutex);  	list_del(&s->memcg_params->list);  	mutex_unlock(&memcg->slab_caches_mutex); +	/* +	 * Clear the pointer to this cache in its parent's memcg_params only +	 * after removing it from the memcg_slab_caches list, otherwise we can +	 * fail to convert memcg_params_to_cache() while traversing the list. +	 */ +	VM_BUG_ON(!root->memcg_params->memcg_caches[id]); +	root->memcg_params->memcg_caches[id] = NULL; +  	css_put(&memcg->css); -out: -	kfree(s->memcg_params);  }  /* @@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)  	 * So if we aren't down to zero, we'll just schedule a worker and try  	 * again  	 */ -	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { +	if (atomic_read(&cachep->memcg_params->nr_pages) != 0)  		kmem_cache_shrink(cachep); -		if (atomic_read(&cachep->memcg_params->nr_pages) == 0) -			return; -	} else +	else  		kmem_cache_destroy(cachep);  } @@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)  	schedule_work(&cachep->memcg_params->destroy);  } -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); - -/* - * Called with memcg_cache_mutex held - */ -static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, -					 struct kmem_cache *s) +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, +						  struct kmem_cache *s)  {  	struct kmem_cache *new;  	static char *tmp_name = NULL; +	static DEFINE_MUTEX(mutex);	/* protects tmp_name */ -	lockdep_assert_held(&memcg_cache_mutex); +	BUG_ON(!memcg_can_account_kmem(memcg)); +	mutex_lock(&mutex);  	/*  	 * kmem_cache_create_memcg duplicates the given name and  	 * cgroup_name for this name requires RCU context. 
@@ -3394,47 +3429,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,  	if (new)  		new->allocflags |= __GFP_KMEMCG; +	else +		new = s; +	mutex_unlock(&mutex);  	return new;  } -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, -						  struct kmem_cache *cachep) -{ -	struct kmem_cache *new_cachep; -	int idx; - -	BUG_ON(!memcg_can_account_kmem(memcg)); - -	idx = memcg_cache_id(memcg); - -	mutex_lock(&memcg_cache_mutex); -	new_cachep = cache_from_memcg_idx(cachep, idx); -	if (new_cachep) { -		css_put(&memcg->css); -		goto out; -	} - -	new_cachep = kmem_cache_dup(memcg, cachep); -	if (new_cachep == NULL) { -		new_cachep = cachep; -		css_put(&memcg->css); -		goto out; -	} - -	atomic_set(&new_cachep->memcg_params->nr_pages , 0); - -	cachep->memcg_params->memcg_caches[idx] = new_cachep; -	/* -	 * the readers won't lock, make sure everybody sees the updated value, -	 * so they won't put stuff in the queue again for no reason -	 */ -	wmb(); -out: -	mutex_unlock(&memcg_cache_mutex); -	return new_cachep; -} -  void kmem_cache_destroy_memcg_children(struct kmem_cache *s)  {  	struct kmem_cache *c; @@ -3452,9 +3453,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)  	 *  	 * Still, we don't want anyone else freeing memcg_caches under our  	 * noses, which can happen if a new memcg comes to life. As usual, -	 * we'll take the set_limit_mutex to protect ourselves against this. +	 * we'll take the activate_kmem_mutex to protect ourselves against +	 * this.  	 */ -	mutex_lock(&set_limit_mutex); +	mutex_lock(&activate_kmem_mutex);  	for_each_memcg_cache_index(i) {  		c = cache_from_memcg_idx(s, i);  		if (!c) @@ -3477,7 +3479,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)  		cancel_work_sync(&c->memcg_params->destroy);  		kmem_cache_destroy(c);  	} -	mutex_unlock(&set_limit_mutex); +	mutex_unlock(&activate_kmem_mutex);  }  struct create_work { @@ -3509,6 +3511,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)  	cw = container_of(w, struct create_work, work);  	memcg_create_kmem_cache(cw->memcg, cw->cachep); +	css_put(&cw->memcg->css);  	kfree(cw);  } @@ -3568,7 +3571,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  					  gfp_t gfp)  {  	struct mem_cgroup *memcg; -	int idx; +	struct kmem_cache *memcg_cachep;  	VM_BUG_ON(!cachep->memcg_params);  	VM_BUG_ON(!cachep->memcg_params->is_root_cache); @@ -3582,15 +3585,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  	if (!memcg_can_account_kmem(memcg))  		goto out; -	idx = memcg_cache_id(memcg); - -	/* -	 * barrier to mare sure we're always seeing the up to date value.  The -	 * code updating memcg_caches will issue a write barrier to match this. -	 */ -	read_barrier_depends(); -	if (likely(cache_from_memcg_idx(cachep, idx))) { -		cachep = cache_from_memcg_idx(cachep, idx); +	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); +	if (likely(memcg_cachep)) { +		cachep = memcg_cachep;  		goto out;  	} @@ -3744,7 +3741,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)  	if (!memcg)  		return; -	VM_BUG_ON(mem_cgroup_is_root(memcg)); +	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);  	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);  }  #else @@ -3823,7 +3820,7 @@ static int mem_cgroup_move_account(struct page *page,  	bool anon = PageAnon(page);  	VM_BUG_ON(from == to); -	VM_BUG_ON(PageLRU(page)); +	VM_BUG_ON_PAGE(PageLRU(page), page);  	/*  	 * The page is isolated from LRU. 
So, collapse function  	 * will not handle this page. But page splitting can happen. @@ -3916,7 +3913,7 @@ static int mem_cgroup_move_parent(struct page *page,  		parent = root_mem_cgroup;  	if (nr_pages > 1) { -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  		flags = compound_lock_irqsave(page);  	} @@ -3950,7 +3947,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,  	if (PageTransHuge(page)) {  		nr_pages <<= compound_order(page); -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  		/*  		 * Never OOM-kill a process for a huge page.  The  		 * fault handler will fall back to regular pages. @@ -3970,8 +3967,8 @@ int mem_cgroup_newpage_charge(struct page *page,  {  	if (mem_cgroup_disabled())  		return 0; -	VM_BUG_ON(page_mapped(page)); -	VM_BUG_ON(page->mapping && !PageAnon(page)); +	VM_BUG_ON_PAGE(page_mapped(page), page); +	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);  	VM_BUG_ON(!mm);  	return mem_cgroup_charge_common(page, mm, gfp_mask,  					MEM_CGROUP_CHARGE_TYPE_ANON); @@ -4175,7 +4172,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,  	if (PageTransHuge(page)) {  		nr_pages <<= compound_order(page); -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  	}  	/*  	 * Check if our page_cgroup is valid @@ -4267,7 +4264,7 @@ void mem_cgroup_uncharge_page(struct page *page)  	/* early check. */  	if (page_mapped(page))  		return; -	VM_BUG_ON(page->mapping && !PageAnon(page)); +	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);  	/*  	 * If the page is in swap cache, uncharge should be deferred  	 * to the swap path, which also properly accounts swap usage @@ -4287,8 +4284,8 @@ void mem_cgroup_uncharge_page(struct page *page)  void mem_cgroup_uncharge_cache_page(struct page *page)  { -	VM_BUG_ON(page_mapped(page)); -	VM_BUG_ON(page->mapping); +	VM_BUG_ON_PAGE(page_mapped(page), page); +	VM_BUG_ON_PAGE(page->mapping, page);  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);  } @@ -5112,14 +5109,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)  	return val << PAGE_SHIFT;  } -static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, -			       struct cftype *cft, struct file *file, -			       char __user *buf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, +				   struct cftype *cft)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	char str[64];  	u64 val; -	int name, len; +	int name;  	enum res_type type;  	type = MEMFILE_TYPE(cft->private); @@ -5145,15 +5140,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,  		BUG();  	} -	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); -	return simple_read_from_buffer(buf, nbytes, ppos, str, len); +	return val;  } -static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) -{ -	int ret = -EINVAL;  #ifdef CONFIG_MEMCG_KMEM -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +/* should be called with activate_kmem_mutex held */ +static int __memcg_activate_kmem(struct mem_cgroup *memcg, +				 unsigned long long limit) +{ +	int err = 0; +	int memcg_id; + +	if (memcg_kmem_is_active(memcg)) +		return 0; + +	/* +	 * We are going to allocate memory for data shared by all memory +	 * cgroups so let's stop accounting here. 
+	 */ +	memcg_stop_kmem_account(); +  	/*  	 * For simplicity, we won't allow this to be disabled.  It also can't  	 * be changed if the cgroup has children already, or if tasks had @@ -5167,72 +5173,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)  	 * of course permitted.  	 */  	mutex_lock(&memcg_create_mutex); -	mutex_lock(&set_limit_mutex); -	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { -		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { -			ret = -EBUSY; -			goto out; -		} -		ret = res_counter_set_limit(&memcg->kmem, val); -		VM_BUG_ON(ret); +	if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) +		err = -EBUSY; +	mutex_unlock(&memcg_create_mutex); +	if (err) +		goto out; -		ret = memcg_update_cache_sizes(memcg); -		if (ret) { -			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); -			goto out; -		} -		static_key_slow_inc(&memcg_kmem_enabled_key); -		/* -		 * setting the active bit after the inc will guarantee no one -		 * starts accounting before all call sites are patched -		 */ -		memcg_kmem_set_active(memcg); -	} else -		ret = res_counter_set_limit(&memcg->kmem, val); +	memcg_id = ida_simple_get(&kmem_limited_groups, +				  0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); +	if (memcg_id < 0) { +		err = memcg_id; +		goto out; +	} + +	/* +	 * Make sure we have enough space for this cgroup in each root cache's +	 * memcg_params. +	 */ +	err = memcg_update_all_caches(memcg_id + 1); +	if (err) +		goto out_rmid; + +	memcg->kmemcg_id = memcg_id; +	INIT_LIST_HEAD(&memcg->memcg_slab_caches); +	mutex_init(&memcg->slab_caches_mutex); + +	/* +	 * We couldn't have accounted to this cgroup, because it hasn't got the +	 * active bit set yet, so this should succeed. +	 */ +	err = res_counter_set_limit(&memcg->kmem, limit); +	VM_BUG_ON(err); + +	static_key_slow_inc(&memcg_kmem_enabled_key); +	/* +	 * Setting the active bit after enabling static branching will +	 * guarantee no one starts accounting before all call sites are +	 * patched. +	 */ +	memcg_kmem_set_active(memcg);  out: -	mutex_unlock(&set_limit_mutex); -	mutex_unlock(&memcg_create_mutex); -#endif +	memcg_resume_kmem_account(); +	return err; + +out_rmid: +	ida_simple_remove(&kmem_limited_groups, memcg_id); +	goto out; +} + +static int memcg_activate_kmem(struct mem_cgroup *memcg, +			       unsigned long long limit) +{ +	int ret; + +	mutex_lock(&activate_kmem_mutex); +	ret = __memcg_activate_kmem(memcg, limit); +	mutex_unlock(&activate_kmem_mutex); +	return ret; +} + +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, +				   unsigned long long val) +{ +	int ret; + +	if (!memcg_kmem_is_active(memcg)) +		ret = memcg_activate_kmem(memcg, val); +	else +		ret = res_counter_set_limit(&memcg->kmem, val);  	return ret;  } -#ifdef CONFIG_MEMCG_KMEM  static int memcg_propagate_kmem(struct mem_cgroup *memcg)  {  	int ret = 0;  	struct mem_cgroup *parent = parent_mem_cgroup(memcg); -	if (!parent) -		goto out; -	memcg->kmem_account_flags = parent->kmem_account_flags; -	/* -	 * When that happen, we need to disable the static branch only on those -	 * memcgs that enabled it. To achieve this, we would be forced to -	 * complicate the code by keeping track of which memcgs were the ones -	 * that actually enabled limits, and which ones got it from its -	 * parents. -	 * -	 * It is a lot simpler just to do static_key_slow_inc() on every child -	 * that is accounted. 
-	 */ -	if (!memcg_kmem_is_active(memcg)) -		goto out; +	if (!parent) +		return 0; +	mutex_lock(&activate_kmem_mutex);  	/* -	 * __mem_cgroup_free() will issue static_key_slow_dec() because this -	 * memcg is active already. If the later initialization fails then the -	 * cgroup core triggers the cleanup so we do not have to do it here. +	 * If the parent cgroup is not kmem-active now, it cannot be activated +	 * after this point, because it has at least one child already.  	 */ -	static_key_slow_inc(&memcg_kmem_enabled_key); - -	mutex_lock(&set_limit_mutex); -	memcg_stop_kmem_account(); -	ret = memcg_update_cache_sizes(memcg); -	memcg_resume_kmem_account(); -	mutex_unlock(&set_limit_mutex); -out: +	if (memcg_kmem_is_active(parent)) +		ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); +	mutex_unlock(&activate_kmem_mutex);  	return ret;  } +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, +				   unsigned long long val) +{ +	return -EINVAL; +}  #endif /* CONFIG_MEMCG_KMEM */  /* @@ -5266,7 +5301,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  		else if (type == _MEMSWAP)  			ret = mem_cgroup_resize_memsw_limit(memcg, val);  		else if (type == _KMEM) -			ret = memcg_update_kmem_limit(css, val); +			ret = memcg_update_kmem_limit(memcg, val);  		else  			return -EINVAL;  		break; @@ -5383,8 +5418,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,  #endif  #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup_subsys_state *css, -				struct cftype *cft, struct seq_file *m) +static int memcg_numa_stat_show(struct seq_file *m, void *v)  {  	struct numa_stat {  		const char *name; @@ -5400,7 +5434,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,  	const struct numa_stat *stat;  	int nid;  	unsigned long nr; -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {  		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -5439,10 +5473,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);  } -static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, -				 struct seq_file *m) +static int memcg_stat_show(struct seq_file *m, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));  	struct mem_cgroup *mi;  	unsigned int i; @@ -5651,13 +5684,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)  		mem_cgroup_oom_notify_cb(iter);  } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args, enum res_type type)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_thresholds *thresholds;  	struct mem_cgroup_threshold_ary *new; -	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 threshold, usage;  	int i, size, ret; @@ -5734,13 +5765,23 @@ unlock:  	return ret;  } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args) +{ +	return 
__mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args) +{ +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, enum res_type type)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_thresholds *thresholds;  	struct mem_cgroup_threshold_ary *new; -	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 usage;  	int i, j, size; @@ -5813,14 +5854,23 @@ unlock:  	mutex_unlock(&memcg->thresholds_lock);  } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd) +{ +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd) +{ +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_eventfd_list *event; -	enum res_type type = MEMFILE_TYPE(cft->private); -	BUG_ON(type != _OOM_TYPE);  	event = kmalloc(sizeof(*event),	GFP_KERNEL);  	if (!event)  		return -ENOMEM; @@ -5838,14 +5888,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,  	return 0;  } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd) +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_eventfd_list *ev, *tmp; -	enum res_type type = MEMFILE_TYPE(cft->private); - -	BUG_ON(type != _OOM_TYPE);  	spin_lock(&memcg_oom_lock); @@ -5859,17 +5905,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,  	spin_unlock(&memcg_oom_lock);  } -static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, -	struct cftype *cft,  struct cgroup_map_cb *cb) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); - -	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); -	if (atomic_read(&memcg->under_oom)) -		cb->fill(cb, "under_oom", 1); -	else -		cb->fill(cb, "under_oom", 0); +	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); +	seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));  	return 0;  } @@ -5962,41 +6003,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)  }  #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered.  It tries to support fully configurable + * events for each user.  Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. 
+ */ +static void memcg_event_remove(struct work_struct *work) +{ +	struct mem_cgroup_event *event = +		container_of(work, struct mem_cgroup_event, remove); +	struct mem_cgroup *memcg = event->memcg; + +	remove_wait_queue(event->wqh, &event->wait); + +	event->unregister_event(memcg, event->eventfd); + +	/* Notify userspace the event is going away. */ +	eventfd_signal(event->eventfd, 1); + +	eventfd_ctx_put(event->eventfd); +	kfree(event); +	css_put(&memcg->css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +			    int sync, void *key) +{ +	struct mem_cgroup_event *event = +		container_of(wait, struct mem_cgroup_event, wait); +	struct mem_cgroup *memcg = event->memcg; +	unsigned long flags = (unsigned long)key; + +	if (flags & POLLHUP) { +		/* +		 * If the event has been detached at cgroup removal, we +		 * can simply return knowing the other side will cleanup +		 * for us. +		 * +		 * We can't race against event freeing since the other +		 * side will require wqh->lock via remove_wait_queue(), +		 * which we hold. +		 */ +		spin_lock(&memcg->event_list_lock); +		if (!list_empty(&event->list)) { +			list_del_init(&event->list); +			/* +			 * We are in atomic context, but cgroup_event_remove() +			 * may sleep, so we have to call it in workqueue. +			 */ +			schedule_work(&event->remove); +		} +		spin_unlock(&memcg->event_list_lock); +	} + +	return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, +		wait_queue_head_t *wqh, poll_table *pt) +{ +	struct mem_cgroup_event *event = +		container_of(pt, struct mem_cgroup_event, pt); + +	event->wqh = wqh; +	add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static int memcg_write_event_control(struct cgroup_subsys_state *css, +				     struct cftype *cft, const char *buffer) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup_event *event; +	struct cgroup_subsys_state *cfile_css; +	unsigned int efd, cfd; +	struct fd efile; +	struct fd cfile; +	const char *name; +	char *endp; +	int ret; + +	efd = simple_strtoul(buffer, &endp, 10); +	if (*endp != ' ') +		return -EINVAL; +	buffer = endp + 1; + +	cfd = simple_strtoul(buffer, &endp, 10); +	if ((*endp != ' ') && (*endp != '\0')) +		return -EINVAL; +	buffer = endp + 1; + +	event = kzalloc(sizeof(*event), GFP_KERNEL); +	if (!event) +		return -ENOMEM; + +	event->memcg = memcg; +	INIT_LIST_HEAD(&event->list); +	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); +	init_waitqueue_func_entry(&event->wait, memcg_event_wake); +	INIT_WORK(&event->remove, memcg_event_remove); + +	efile = fdget(efd); +	if (!efile.file) { +		ret = -EBADF; +		goto out_kfree; +	} + +	event->eventfd = eventfd_ctx_fileget(efile.file); +	if (IS_ERR(event->eventfd)) { +		ret = PTR_ERR(event->eventfd); +		goto out_put_efile; +	} + +	cfile = fdget(cfd); +	if (!cfile.file) { +		ret = -EBADF; +		goto out_put_eventfd; +	} + +	/* the process need read permission on control file */ +	/* AV: shouldn't we check that it's been opened for read instead? */ +	ret = inode_permission(file_inode(cfile.file), MAY_READ); +	if (ret < 0) +		goto out_put_cfile; + +	/* +	 * Determine the event callbacks and set them in @event.  
This used +	 * to be done via struct cftype but cgroup core no longer knows +	 * about these events.  The following is crude but the whole thing +	 * is for compatibility anyway. +	 * +	 * DO NOT ADD NEW FILES. +	 */ +	name = cfile.file->f_dentry->d_name.name; + +	if (!strcmp(name, "memory.usage_in_bytes")) { +		event->register_event = mem_cgroup_usage_register_event; +		event->unregister_event = mem_cgroup_usage_unregister_event; +	} else if (!strcmp(name, "memory.oom_control")) { +		event->register_event = mem_cgroup_oom_register_event; +		event->unregister_event = mem_cgroup_oom_unregister_event; +	} else if (!strcmp(name, "memory.pressure_level")) { +		event->register_event = vmpressure_register_event; +		event->unregister_event = vmpressure_unregister_event; +	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { +		event->register_event = memsw_cgroup_usage_register_event; +		event->unregister_event = memsw_cgroup_usage_unregister_event; +	} else { +		ret = -EINVAL; +		goto out_put_cfile; +	} + +	/* +	 * Verify @cfile should belong to @css.  Also, remaining events are +	 * automatically removed on cgroup destruction but the removal is +	 * asynchronous, so take an extra ref on @css. +	 */ +	rcu_read_lock(); + +	ret = -EINVAL; +	cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, +				 &mem_cgroup_subsys); +	if (cfile_css == css && css_tryget(css)) +		ret = 0; + +	rcu_read_unlock(); +	if (ret) +		goto out_put_cfile; + +	ret = event->register_event(memcg, event->eventfd, buffer); +	if (ret) +		goto out_put_css; + +	efile.file->f_op->poll(efile.file, &event->pt); + +	spin_lock(&memcg->event_list_lock); +	list_add(&event->list, &memcg->event_list); +	spin_unlock(&memcg->event_list_lock); + +	fdput(cfile); +	fdput(efile); + +	return 0; + +out_put_css: +	css_put(css); +out_put_cfile: +	fdput(cfile); +out_put_eventfd: +	eventfd_ctx_put(event->eventfd); +out_put_efile: +	fdput(efile); +out_kfree: +	kfree(event); + +	return ret; +} +  static struct cftype mem_cgroup_files[] = {  	{  		.name = "usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE), -		.read = mem_cgroup_read, -		.register_event = mem_cgroup_usage_register_event, -		.unregister_event = mem_cgroup_usage_unregister_event, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),  		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "soft_limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),  		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "failcnt",  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "stat", -		.read_seq_string = memcg_stat_show, +		.seq_show = memcg_stat_show,  	},  	{  		.name = "force_empty", @@ -6009,6 +6270,12 @@ static struct cftype mem_cgroup_files[] = {  		.read_u64 = mem_cgroup_hierarchy_read,  	},  	{ +		.name = "cgroup.event_control",		/* XXX: for compat */ +		.write_string = memcg_write_event_control, +		.flags = CFTYPE_NO_PREFIX, +		.mode = S_IWUGO, +	}, +	{  		.name = "swappiness",  		.read_u64 = mem_cgroup_swappiness_read,  		.write_u64 = 
mem_cgroup_swappiness_write, @@ -6020,21 +6287,17 @@ static struct cftype mem_cgroup_files[] = {  	},  	{  		.name = "oom_control", -		.read_map = mem_cgroup_oom_control_read, +		.seq_show = mem_cgroup_oom_control_read,  		.write_u64 = mem_cgroup_oom_control_write, -		.register_event = mem_cgroup_oom_register_event, -		.unregister_event = mem_cgroup_oom_unregister_event,  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),  	},  	{  		.name = "pressure_level", -		.register_event = vmpressure_register_event, -		.unregister_event = vmpressure_unregister_event,  	},  #ifdef CONFIG_NUMA  	{  		.name = "numa_stat", -		.read_seq_string = memcg_numa_stat_show, +		.seq_show = memcg_numa_stat_show,  	},  #endif  #ifdef CONFIG_MEMCG_KMEM @@ -6042,29 +6305,29 @@ static struct cftype mem_cgroup_files[] = {  		.name = "kmem.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),  		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.usage_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.failcnt",  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  #ifdef CONFIG_SLABINFO  	{  		.name = "kmem.slabinfo", -		.read_seq_string = mem_cgroup_slabinfo_read, +		.seq_show = mem_cgroup_slabinfo_read,  	},  #endif  #endif @@ -6076,27 +6339,25 @@ static struct cftype memsw_cgroup_files[] = {  	{  		.name = "memsw.usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), -		.read = mem_cgroup_read, -		.register_event = mem_cgroup_usage_register_event, -		.unregister_event = mem_cgroup_usage_unregister_event, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),  		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.failcnt",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),  		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{ },	/* terminate */  }; @@ -6139,14 +6400,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  static struct mem_cgroup *mem_cgroup_alloc(void)  {  	struct mem_cgroup *memcg; -	size_t size = memcg_size(); +	size_t size; -	/* Can be very big if nr_node_ids is very big */ -	if (size < PAGE_SIZE) -		memcg = kzalloc(size, GFP_KERNEL); -	else -		memcg = vzalloc(size); +	size = sizeof(struct mem_cgroup); +	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); +	memcg = kzalloc(size, GFP_KERNEL);  	if (!memcg)  		return NULL; @@ -6157,10 +6416,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)  	return memcg;  out_free: -	if (size < PAGE_SIZE) -		kfree(memcg); -	else -		vfree(memcg); +	kfree(memcg);  	return NULL;  } @@ -6178,7 +6434,6 @@ out_free:  static void __mem_cgroup_free(struct mem_cgroup *memcg)  {  	int node; -	size_t size = memcg_size();  	
mem_cgroup_remove_from_trees(memcg); @@ -6199,10 +6454,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)  	 * the cgroup_lock.  	 */  	disarm_static_keys(memcg); -	if (size < PAGE_SIZE) -		kfree(memcg); -	else -		vfree(memcg); +	kfree(memcg);  }  /* @@ -6268,6 +6520,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  	mutex_init(&memcg->thresholds_lock);  	spin_lock_init(&memcg->move_lock);  	vmpressure_init(&memcg->vmpressure); +	INIT_LIST_HEAD(&memcg->event_list); +	spin_lock_init(&memcg->event_list_lock);  	return &memcg->css; @@ -6281,7 +6535,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); -	int error = 0;  	if (css->cgroup->id > MEM_CGROUP_ID_MAX)  		return -ENOSPC; @@ -6316,10 +6569,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)  		if (parent != root_mem_cgroup)  			mem_cgroup_subsys.broken_hierarchy = true;  	} - -	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);  	mutex_unlock(&memcg_create_mutex); -	return error; + +	return memcg_init_kmem(memcg, &mem_cgroup_subsys);  }  /* @@ -6343,6 +6595,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup_event *event, *tmp; + +	/* +	 * Unregister events and notify userspace. +	 * Notify userspace about cgroup removing only after rmdir of cgroup +	 * directory to avoid race between userspace and kernelspace. +	 */ +	spin_lock(&memcg->event_list_lock); +	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { +		list_del_init(&event->list); +		schedule_work(&event->remove); +	} +	spin_unlock(&memcg->event_list_lock);  	kmem_cgroup_css_offline(memcg); @@ -6615,7 +6880,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,  	enum mc_target_type ret = MC_TARGET_NONE;  	page = pmd_page(pmd); -	VM_BUG_ON(!page || !PageHead(page)); +	VM_BUG_ON_PAGE(!page || !PageHead(page), page);  	if (!move_anon())  		return ret;  	pc = lookup_page_cgroup(page);  |
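
The memcg_write_event_control() handler added above keeps the legacy cgroup.event_control interface working entirely inside memcontrol.c: userspace writes "<event_fd> <control_fd> <args>" and the register/unregister callbacks are chosen from the control file's name. Below is a minimal userspace sketch of that flow for a usage threshold. It is illustrative only and not part of the patch; the cgroup path /sys/fs/cgroup/memory/mygroup and the 64M threshold are assumptions about how the v1 memory controller is mounted.

/* Hypothetical userspace example: register a memory usage threshold event. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t count;

	/* eventfd(2) descriptor the kernel signals when the event fires */
	int efd = eventfd(0, 0);
	/* control file; its name selects the callbacks in memcg_write_event_control() */
	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes",
		       O_RDONLY);
	/* per-cgroup registration file handled by memcg_write_event_control() */
	int ecfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
			O_WRONLY);

	if (efd < 0 || cfd < 0 || ecfd < 0) {
		perror("eventfd/open");
		return 1;
	}

	/* "<event_fd> <control_fd> <args>": args here is a threshold in bytes */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 64ULL << 20);
	if (write(ecfd, buf, strlen(buf)) < 0) {
		perror("write cgroup.event_control");
		return 1;
	}

	/* Blocks until usage crosses the threshold or the cgroup is removed */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("threshold event received (count=%llu)\n",
		       (unsigned long long)count);
	return 0;
}

The same write format drives the other legacy events dispatched by name in memcg_write_event_control(): memory.memsw.usage_in_bytes takes a byte threshold as <args>, memory.oom_control takes no <args>, and memory.pressure_level takes "low", "medium" or "critical".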