Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	| 1694
1 file changed, 698 insertions(+), 996 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f14a430c656..28928ce9b07f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,6 +292,9 @@ struct mem_cgroup {  	/* vmpressure notifications */  	struct vmpressure vmpressure; +	/* css_online() has been completed */ +	int initialized; +  	/*  	 * the counter to account for mem+swap usage.  	 */ @@ -754,9 +757,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,  static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,  				       struct mem_cgroup_tree_per_zone *mctz)  { -	spin_lock(&mctz->lock); +	unsigned long flags; + +	spin_lock_irqsave(&mctz->lock, flags);  	__mem_cgroup_remove_exceeded(mz, mctz); -	spin_unlock(&mctz->lock); +	spin_unlock_irqrestore(&mctz->lock, flags);  } @@ -779,7 +784,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  		 * mem is over its softlimit.  		 */  		if (excess || mz->on_tree) { -			spin_lock(&mctz->lock); +			unsigned long flags; + +			spin_lock_irqsave(&mctz->lock, flags);  			/* if on-tree, remove it */  			if (mz->on_tree)  				__mem_cgroup_remove_exceeded(mz, mctz); @@ -788,7 +795,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  			 * If excess is 0, no tree ops.  			 */  			__mem_cgroup_insert_exceeded(mz, mctz, excess); -			spin_unlock(&mctz->lock); +			spin_unlock_irqrestore(&mctz->lock, flags);  		}  	}  } @@ -839,9 +846,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)  {  	struct mem_cgroup_per_zone *mz; -	spin_lock(&mctz->lock); +	spin_lock_irq(&mctz->lock);  	mz = __mem_cgroup_largest_soft_limit_node(mctz); -	spin_unlock(&mctz->lock); +	spin_unlock_irq(&mctz->lock);  	return mz;  } @@ -882,13 +889,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,  	return val;  } -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, -					 bool charge) -{ -	int val = (charge) ? 1 : -1; -	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); -} -  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  					    enum mem_cgroup_events_index idx)  { @@ -909,13 +909,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,  					 struct page *page, -					 bool anon, int nr_pages) +					 int nr_pages)  {  	/*  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is  	 * counted as CACHE even if it's on ANON LRU.  	 
*/ -	if (anon) +	if (PageAnon(page))  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],  				nr_pages);  	else @@ -1013,7 +1013,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,   */  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)  { -	preempt_disable();  	/* threshold event is triggered in finer grain than soft limit */  	if (unlikely(mem_cgroup_event_ratelimit(memcg,  						MEM_CGROUP_TARGET_THRESH))) { @@ -1026,8 +1025,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)  		do_numainfo = mem_cgroup_event_ratelimit(memcg,  						MEM_CGROUP_TARGET_NUMAINFO);  #endif -		preempt_enable(); -  		mem_cgroup_threshold(memcg);  		if (unlikely(do_softlimit))  			mem_cgroup_update_tree(memcg, page); @@ -1035,8 +1032,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)  		if (unlikely(do_numainfo))  			atomic_inc(&memcg->numainfo_events);  #endif -	} else -		preempt_enable(); +	}  }  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1106,10 +1102,21 @@ skip_node:  	 * skipping css reference should be safe.  	 */  	if (next_css) { -		if ((next_css == &root->css) || -		    ((next_css->flags & CSS_ONLINE) && -		     css_tryget_online(next_css))) -			return mem_cgroup_from_css(next_css); +		struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); + +		if (next_css == &root->css) +			return memcg; + +		if (css_tryget_online(next_css)) { +			/* +			 * Make sure the memcg is initialized: +			 * mem_cgroup_css_online() orders the the +			 * initialization against setting the flag. +			 */ +			if (smp_load_acquire(&memcg->initialized)) +				return memcg; +			css_put(next_css); +		}  		prev_css = next_css;  		goto skip_node; @@ -1347,20 +1354,6 @@ out:  	return lruvec;  } -/* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ -  /**   * mem_cgroup_page_lruvec - return lruvec for adding an lru page   * @page: the page @@ -2261,22 +2254,14 @@ cleanup:   *   * Notes: Race condition   * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration.  No race is possible.   * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Uncharge happens to pages with zero references, no race possible.   * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. - * - * Considering "move", this is an only case we see a race. 
To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath.   */  void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2551,55 +2536,65 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,  	return NOTIFY_OK;  } - -/* See mem_cgroup_try_charge() for details */ -enum { -	CHARGE_OK,		/* success */ -	CHARGE_RETRY,		/* need to retry but retry is not bad */ -	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */ -	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, -				unsigned int nr_pages, unsigned int min_pages, -				bool invoke_oom) +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, +		      unsigned int nr_pages)  { -	unsigned long csize = nr_pages * PAGE_SIZE; +	unsigned int batch = max(CHARGE_BATCH, nr_pages); +	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;  	struct mem_cgroup *mem_over_limit;  	struct res_counter *fail_res; +	unsigned long nr_reclaimed;  	unsigned long flags = 0; -	int ret; +	unsigned long long size; +	int ret = 0; -	ret = res_counter_charge(&memcg->res, csize, &fail_res); +	if (mem_cgroup_is_root(memcg)) +		goto done; +retry: +	if (consume_stock(memcg, nr_pages)) +		goto done; -	if (likely(!ret)) { +	size = batch * PAGE_SIZE; +	if (!res_counter_charge(&memcg->res, size, &fail_res)) {  		if (!do_swap_account) -			return CHARGE_OK; -		ret = res_counter_charge(&memcg->memsw, csize, &fail_res); -		if (likely(!ret)) -			return CHARGE_OK; - -		res_counter_uncharge(&memcg->res, csize); +			goto done_restock; +		if (!res_counter_charge(&memcg->memsw, size, &fail_res)) +			goto done_restock; +		res_counter_uncharge(&memcg->res, size);  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);  		flags |= MEM_CGROUP_RECLAIM_NOSWAP;  	} else  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + +	if (batch > nr_pages) { +		batch = nr_pages; +		goto retry; +	} +  	/* -	 * Never reclaim on behalf of optional batching, retry with a -	 * single page instead. +	 * Unlike in global OOM situations, memcg is not in a physical +	 * memory shortage.  Allow dying and OOM-killed tasks to +	 * bypass the last charges so that they can exit quickly and +	 * free their memory.  	 */ -	if (nr_pages > min_pages) -		return CHARGE_RETRY; +	if (unlikely(test_thread_flag(TIF_MEMDIE) || +		     fatal_signal_pending(current) || +		     current->flags & PF_EXITING)) +		goto bypass; + +	if (unlikely(task_in_memcg_oom(current))) +		goto nomem;  	if (!(gfp_mask & __GFP_WAIT)) -		return CHARGE_WOULDBLOCK; +		goto nomem; -	if (gfp_mask & __GFP_NORETRY) -		return CHARGE_NOMEM; +	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); -	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages) -		return CHARGE_RETRY; +		goto retry; + +	if (gfp_mask & __GFP_NORETRY) +		goto nomem;  	/*  	 * Even though the limit is exceeded at this point, reclaim  	 * may have been able to free some pages.  Retry the charge @@ -2609,142 +2604,48 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	 * unlikely to succeed so close to the limit, and we fall back  	 * to regular pages anyway in case of failure.  	 
*/ -	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) -		return CHARGE_RETRY; - +	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) +		goto retry;  	/*  	 * At task move, charge accounts can be doubly counted. So, it's  	 * better to wait until the end of task_move if something is going on.  	 */  	if (mem_cgroup_wait_acct_move(mem_over_limit)) -		return CHARGE_RETRY; - -	if (invoke_oom) -		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - -	return CHARGE_NOMEM; -} - -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, -				 gfp_t gfp_mask, -				 unsigned int nr_pages, -				 bool oom) -{ -	unsigned int batch = max(CHARGE_BATCH, nr_pages); -	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; -	int ret; - -	if (mem_cgroup_is_root(memcg)) -		goto done; -	/* -	 * Unlike in global OOM situations, memcg is not in a physical -	 * memory shortage.  Allow dying and OOM-killed tasks to -	 * bypass the last charges so that they can exit quickly and -	 * free their memory. -	 */ -	if (unlikely(test_thread_flag(TIF_MEMDIE) || -		     fatal_signal_pending(current) || -		     current->flags & PF_EXITING)) -		goto bypass; +		goto retry; -	if (unlikely(task_in_memcg_oom(current))) -		goto nomem; +	if (nr_retries--) +		goto retry;  	if (gfp_mask & __GFP_NOFAIL) -		oom = false; -again: -	if (consume_stock(memcg, nr_pages)) -		goto done; - -	do { -		bool invoke_oom = oom && !nr_oom_retries; - -		/* If killed, bypass charge */ -		if (fatal_signal_pending(current)) -			goto bypass; +		goto bypass; -		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, -					   nr_pages, invoke_oom); -		switch (ret) { -		case CHARGE_OK: -			break; -		case CHARGE_RETRY: /* not in OOM situation but retry */ -			batch = nr_pages; -			goto again; -		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ -			goto nomem; -		case CHARGE_NOMEM: /* OOM routine works */ -			if (!oom || invoke_oom) -				goto nomem; -			nr_oom_retries--; -			break; -		} -	} while (ret != CHARGE_OK); +	if (fatal_signal_pending(current)) +		goto bypass; -	if (batch > nr_pages) -		refill_stock(memcg, batch - nr_pages); -done: -	return 0; +	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));  nomem:  	if (!(gfp_mask & __GFP_NOFAIL))  		return -ENOMEM;  bypass:  	return -EINTR; -} - -/** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ -static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, -				 gfp_t gfp_mask, -				 unsigned int nr_pages, -				 bool oom) - -{ -	struct mem_cgroup *memcg; -	int ret; - -	memcg = get_mem_cgroup_from_mm(mm); -	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); -	css_put(&memcg->css); -	if (ret == -EINTR) -		memcg = root_mem_cgroup; -	else if (ret) -		memcg = NULL; -	return memcg; +done_restock: +	if (batch > nr_pages) +		refill_stock(memcg, batch - nr_pages); +done: +	return ret;  } -/* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). 
- */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, -				       unsigned int nr_pages) +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)  { -	if (!mem_cgroup_is_root(memcg)) { -		unsigned long bytes = nr_pages * PAGE_SIZE; +	unsigned long bytes = nr_pages * PAGE_SIZE; -		res_counter_uncharge(&memcg->res, bytes); -		if (do_swap_account) -			res_counter_uncharge(&memcg->memsw, bytes); -	} +	if (mem_cgroup_is_root(memcg)) +		return; + +	res_counter_uncharge(&memcg->res, bytes); +	if (do_swap_account) +		res_counter_uncharge(&memcg->memsw, bytes);  }  /* @@ -2779,6 +2680,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)  	return mem_cgroup_from_id(id);  } +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges.  If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  {  	struct mem_cgroup *memcg = NULL; @@ -2789,7 +2700,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  	VM_BUG_ON_PAGE(!PageLocked(page), page);  	pc = lookup_page_cgroup(page); -	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) {  		memcg = pc->mem_cgroup;  		if (memcg && !css_tryget_online(&memcg->css)) @@ -2803,23 +2713,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  			memcg = NULL;  		rcu_read_unlock();  	} -	unlock_page_cgroup(pc);  	return memcg;  } -static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, -				       struct page *page, -				       unsigned int nr_pages, -				       enum charge_type ctype, -				       bool lrucare) +static void lock_page_lru(struct page *page, int *isolated) +{ +	struct zone *zone = page_zone(page); + +	spin_lock_irq(&zone->lru_lock); +	if (PageLRU(page)) { +		struct lruvec *lruvec; + +		lruvec = mem_cgroup_page_lruvec(page, zone); +		ClearPageLRU(page); +		del_page_from_lru_list(page, lruvec, page_lru(page)); +		*isolated = 1; +	} else +		*isolated = 0; +} + +static void unlock_page_lru(struct page *page, int isolated) +{ +	struct zone *zone = page_zone(page); + +	if (isolated) { +		struct lruvec *lruvec; + +		lruvec = mem_cgroup_page_lruvec(page, zone); +		VM_BUG_ON_PAGE(PageLRU(page), page); +		SetPageLRU(page); +		add_page_to_lru_list(page, lruvec, page_lru(page)); +	} +	spin_unlock_irq(&zone->lru_lock); +} + +static void commit_charge(struct page *page, struct mem_cgroup *memcg, +			  bool lrucare)  {  	struct page_cgroup *pc = lookup_page_cgroup(page); -	struct zone *uninitialized_var(zone); -	struct lruvec *lruvec; -	bool was_on_lru = false; -	bool anon; +	int isolated; -	lock_page_cgroup(pc);  	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);  	/*  	 * we don't need page_cgroup_lock about tail pages, becase they are not @@ -2830,52 +2763,28 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page  	 * may already be on some other mem_cgroup's LRU.  Take care of it.  	 
*/ -	if (lrucare) { -		zone = page_zone(page); -		spin_lock_irq(&zone->lru_lock); -		if (PageLRU(page)) { -			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); -			ClearPageLRU(page); -			del_page_from_lru_list(page, lruvec, page_lru(page)); -			was_on_lru = true; -		} -	} +	if (lrucare) +		lock_page_lru(page, &isolated); -	pc->mem_cgroup = memcg;  	/* -	 * We access a page_cgroup asynchronously without lock_page_cgroup(). -	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup -	 * is accessed after testing USED bit. To make pc->mem_cgroup visible -	 * before USED bit, we need memory barrier here. -	 * See mem_cgroup_add_lru_list(), etc. +	 * Nobody should be changing or seriously looking at +	 * pc->mem_cgroup and pc->flags at this point: +	 * +	 * - the page is uncharged +	 * +	 * - the page is off-LRU +	 * +	 * - an anonymous fault has exclusive page access, except for +	 *   a locked page table +	 * +	 * - a page cache insertion, a swapin fault, or a migration +	 *   have the page locked  	 */ -	smp_wmb(); -	SetPageCgroupUsed(pc); - -	if (lrucare) { -		if (was_on_lru) { -			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); -			VM_BUG_ON_PAGE(PageLRU(page), page); -			SetPageLRU(page); -			add_page_to_lru_list(page, lruvec, page_lru(page)); -		} -		spin_unlock_irq(&zone->lru_lock); -	} - -	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) -		anon = true; -	else -		anon = false; - -	mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); -	unlock_page_cgroup(pc); +	pc->mem_cgroup = memcg; +	pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); -	/* -	 * "charge_statistics" updated event counter. Then, check it. -	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. -	 * if they exceeds softlimit. -	 */ -	memcg_check_events(memcg, page); +	if (lrucare) +		unlock_page_lru(page, isolated);  }  static DEFINE_MUTEX(set_limit_mutex); @@ -2937,22 +2846,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)  	if (ret)  		return ret; -	ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, -				    oom_gfp_allowed(gfp)); +	ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);  	if (ret == -EINTR)  {  		/* -		 * mem_cgroup_try_charge() chosed to bypass to root due to -		 * OOM kill or fatal signal.  Since our only options are to -		 * either fail the allocation or charge it to this cgroup, do -		 * it as a temporary condition. But we can't fail. From a -		 * kmem/slab perspective, the cache has already been selected, -		 * by mem_cgroup_kmem_get_cache(), so it is too late to change +		 * try_charge() chose to bypass to root due to OOM kill or +		 * fatal signal.  Since our only options are to either fail +		 * the allocation or charge it to this cgroup, do it as a +		 * temporary condition. But we can't fail. From a kmem/slab +		 * perspective, the cache has already been selected, by +		 * mem_cgroup_kmem_get_cache(), so it is too late to change  		 * our minds.  		 *  		 * This condition will only trigger if the task entered -		 * memcg_charge_kmem in a sane state, but was OOM-killed during -		 * mem_cgroup_try_charge() above. Tasks that were already -		 * dying when the allocation triggers should have been already +		 * memcg_charge_kmem in a sane state, but was OOM-killed +		 * during try_charge() above. 
Tasks that were already dying +		 * when the allocation triggers should have been already  		 * directed to the root cgroup in memcontrol.h  		 */  		res_counter_charge_nofail(&memcg->res, size, &fail_res); @@ -3463,12 +3371,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,  		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);  		return;  	} - +	/* +	 * The page is freshly allocated and not visible to any +	 * outside callers yet.  Set up pc non-atomically. +	 */  	pc = lookup_page_cgroup(page); -	lock_page_cgroup(pc);  	pc->mem_cgroup = memcg; -	SetPageCgroupUsed(pc); -	unlock_page_cgroup(pc); +	pc->flags = PCG_USED;  }  void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3478,19 +3387,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)  	pc = lookup_page_cgroup(page); -	/* -	 * Fast unlocked return. Theoretically might have changed, have to -	 * check again after locking. -	 */  	if (!PageCgroupUsed(pc))  		return; -	lock_page_cgroup(pc); -	if (PageCgroupUsed(pc)) { -		memcg = pc->mem_cgroup; -		ClearPageCgroupUsed(pc); -	} -	unlock_page_cgroup(pc); +	memcg = pc->mem_cgroup; +	pc->flags = 0;  	/*  	 * We trust that only if there is a memcg associated with the page, it @@ -3510,7 +3411,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)  #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)  /*   * Because tail pages are not marked as "used", set it. We're under   * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -3531,8 +3431,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)  	for (i = 1; i < HPAGE_PMD_NR; i++) {  		pc = head_pc + i;  		pc->mem_cgroup = memcg; -		smp_wmb();/* see __commit_charge() */ -		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; +		pc->flags = head_pc->flags;  	}  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],  		       HPAGE_PMD_NR); @@ -3562,7 +3461,6 @@ static int mem_cgroup_move_account(struct page *page,  {  	unsigned long flags;  	int ret; -	bool anon = PageAnon(page);  	VM_BUG_ON(from == to);  	VM_BUG_ON_PAGE(PageLRU(page), page); @@ -3576,15 +3474,21 @@ static int mem_cgroup_move_account(struct page *page,  	if (nr_pages > 1 && !PageTransHuge(page))  		goto out; -	lock_page_cgroup(pc); +	/* +	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup +	 * of its source page while we change it: page migration takes +	 * both pages off the LRU, but page cache replacement doesn't. +	 */ +	if (!trylock_page(page)) +		goto out;  	ret = -EINVAL;  	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) -		goto unlock; +		goto out_unlock;  	move_lock_mem_cgroup(from, &flags); -	if (!anon && page_mapped(page)) { +	if (!PageAnon(page) && page_mapped(page)) {  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],  			       nr_pages);  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3598,20 +3502,25 @@ static int mem_cgroup_move_account(struct page *page,  			       nr_pages);  	} -	mem_cgroup_charge_statistics(from, page, anon, -nr_pages); +	/* +	 * It is safe to change pc->mem_cgroup here because the page +	 * is referenced, charged, and isolated - we can't race with +	 * uncharging, charging, migration, or LRU putback. 
+	 */  	/* caller should have done css_get */  	pc->mem_cgroup = to; -	mem_cgroup_charge_statistics(to, page, anon, nr_pages);  	move_unlock_mem_cgroup(from, &flags);  	ret = 0; -unlock: -	unlock_page_cgroup(pc); -	/* -	 * check events -	 */ + +	local_irq_disable(); +	mem_cgroup_charge_statistics(to, page, nr_pages);  	memcg_check_events(to, page); +	mem_cgroup_charge_statistics(from, page, -nr_pages);  	memcg_check_events(from, page); +	local_irq_enable(); +out_unlock: +	unlock_page(page);  out:  	return ret;  } @@ -3682,456 +3591,12 @@ out:  	return ret;  } -int mem_cgroup_charge_anon(struct page *page, -			      struct mm_struct *mm, gfp_t gfp_mask) -{ -	unsigned int nr_pages = 1; -	struct mem_cgroup *memcg; -	bool oom = true; - -	if (mem_cgroup_disabled()) -		return 0; - -	VM_BUG_ON_PAGE(page_mapped(page), page); -	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); -	VM_BUG_ON(!mm); - -	if (PageTransHuge(page)) { -		nr_pages <<= compound_order(page); -		VM_BUG_ON_PAGE(!PageTransHuge(page), page); -		/* -		 * Never OOM-kill a process for a huge page.  The -		 * fault handler will fall back to regular pages. -		 */ -		oom = false; -	} - -	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); -	if (!memcg) -		return -ENOMEM; -	__mem_cgroup_commit_charge(memcg, page, nr_pages, -				   MEM_CGROUP_CHARGE_TYPE_ANON, false); -	return 0; -} - -/* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ -static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, -					  struct page *page, -					  gfp_t mask, -					  struct mem_cgroup **memcgp) -{ -	struct mem_cgroup *memcg = NULL; -	struct page_cgroup *pc; -	int ret; - -	pc = lookup_page_cgroup(page); -	/* -	 * Every swap fault against a single page tries to charge the -	 * page, bail as early as possible.  shmem_unuse() encounters -	 * already charged pages, too.  The USED bit is protected by -	 * the page lock, which serializes swap cache removal, which -	 * in turn serializes uncharging. -	 */ -	if (PageCgroupUsed(pc)) -		goto out; -	if (do_swap_account) -		memcg = try_get_mem_cgroup_from_page(page); -	if (!memcg) -		memcg = get_mem_cgroup_from_mm(mm); -	ret = mem_cgroup_try_charge(memcg, mask, 1, true); -	css_put(&memcg->css); -	if (ret == -EINTR) -		memcg = root_mem_cgroup; -	else if (ret) -		return ret; -out: -	*memcgp = memcg; -	return 0; -} - -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, -				 gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ -	if (mem_cgroup_disabled()) { -		*memcgp = NULL; -		return 0; -	} -	/* -	 * A racing thread's fault, or swapoff, may have already -	 * updated the pte, and even removed page from swap cache: in -	 * those cases unuse_pte()'s pte_same() test will fail; but -	 * there's also a KSM case which does need to charge the page. 
-	 */ -	if (!PageSwapCache(page)) { -		struct mem_cgroup *memcg; - -		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); -		if (!memcg) -			return -ENOMEM; -		*memcgp = memcg; -		return 0; -	} -	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); -} - -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) -{ -	if (mem_cgroup_disabled()) -		return; -	if (!memcg) -		return; -	__mem_cgroup_cancel_charge(memcg, 1); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, -					enum charge_type ctype) -{ -	if (mem_cgroup_disabled()) -		return; -	if (!memcg) -		return; - -	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true); -	/* -	 * Now swap is on-memory. This means this page may be -	 * counted both as mem and swap....double count. -	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable -	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() -	 * may call delete_from_swap_cache() before reach here. -	 */ -	if (do_swap_account && PageSwapCache(page)) { -		swp_entry_t ent = {.val = page_private(page)}; -		mem_cgroup_uncharge_swap(ent); -	} -} - -void mem_cgroup_commit_charge_swapin(struct page *page, -				     struct mem_cgroup *memcg) -{ -	__mem_cgroup_commit_charge_swapin(page, memcg, -					  MEM_CGROUP_CHARGE_TYPE_ANON); -} - -int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, -				gfp_t gfp_mask) -{ -	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; -	struct mem_cgroup *memcg; -	int ret; - -	if (mem_cgroup_disabled()) -		return 0; -	if (PageCompound(page)) -		return 0; - -	if (PageSwapCache(page)) { /* shmem */ -		ret = __mem_cgroup_try_charge_swapin(mm, page, -						     gfp_mask, &memcg); -		if (ret) -			return ret; -		__mem_cgroup_commit_charge_swapin(page, memcg, type); -		return 0; -	} - -	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); -	if (!memcg) -		return -ENOMEM; -	__mem_cgroup_commit_charge(memcg, page, 1, type, false); -	return 0; -} - -static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, -				   unsigned int nr_pages, -				   const enum charge_type ctype) -{ -	struct memcg_batch_info *batch = NULL; -	bool uncharge_memsw = true; - -	/* If swapout, usage of swap doesn't decrease */ -	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) -		uncharge_memsw = false; - -	batch = ¤t->memcg_batch; -	/* -	 * In usual, we do css_get() when we remember memcg pointer. -	 * But in this case, we keep res->usage until end of a series of -	 * uncharges. Then, it's ok to ignore memcg's refcnt. -	 */ -	if (!batch->memcg) -		batch->memcg = memcg; -	/* -	 * do_batch > 0 when unmapping pages or inode invalidate/truncate. -	 * In those cases, all pages freed continuously can be expected to be in -	 * the same cgroup and we have chance to coalesce uncharges. -	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) -	 * because we want to do uncharge as soon as possible. -	 */ - -	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) -		goto direct_uncharge; - -	if (nr_pages > 1) -		goto direct_uncharge; - -	/* -	 * In typical case, batch->memcg == mem. This means we can -	 * merge a series of uncharges to an uncharge of res_counter. -	 * If not, we uncharge res_counter ony by one. 
-	 */ -	if (batch->memcg != memcg) -		goto direct_uncharge; -	/* remember freed charge and uncharge it later */ -	batch->nr_pages++; -	if (uncharge_memsw) -		batch->memsw_nr_pages++; -	return; -direct_uncharge: -	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); -	if (uncharge_memsw) -		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); -	if (unlikely(batch->memcg != memcg)) -		memcg_oom_recover(memcg); -} - -/* - * uncharge if !page_mapped(page) - */ -static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, -			     bool end_migration) -{ -	struct mem_cgroup *memcg = NULL; -	unsigned int nr_pages = 1; -	struct page_cgroup *pc; -	bool anon; - -	if (mem_cgroup_disabled()) -		return NULL; - -	if (PageTransHuge(page)) { -		nr_pages <<= compound_order(page); -		VM_BUG_ON_PAGE(!PageTransHuge(page), page); -	} -	/* -	 * Check if our page_cgroup is valid -	 */ -	pc = lookup_page_cgroup(page); -	if (unlikely(!PageCgroupUsed(pc))) -		return NULL; - -	lock_page_cgroup(pc); - -	memcg = pc->mem_cgroup; - -	if (!PageCgroupUsed(pc)) -		goto unlock_out; - -	anon = PageAnon(page); - -	switch (ctype) { -	case MEM_CGROUP_CHARGE_TYPE_ANON: -		/* -		 * Generally PageAnon tells if it's the anon statistics to be -		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is -		 * used before page reached the stage of being marked PageAnon. -		 */ -		anon = true; -		/* fallthrough */ -	case MEM_CGROUP_CHARGE_TYPE_DROP: -		/* See mem_cgroup_prepare_migration() */ -		if (page_mapped(page)) -			goto unlock_out; -		/* -		 * Pages under migration may not be uncharged.  But -		 * end_migration() /must/ be the one uncharging the -		 * unused post-migration page and so it has to call -		 * here with the migration bit still set.  See the -		 * res_counter handling below. -		 */ -		if (!end_migration && PageCgroupMigration(pc)) -			goto unlock_out; -		break; -	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: -		if (!PageAnon(page)) {	/* Shared memory */ -			if (page->mapping && !page_is_file_cache(page)) -				goto unlock_out; -		} else if (page_mapped(page)) /* Anon */ -				goto unlock_out; -		break; -	default: -		break; -	} - -	mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - -	ClearPageCgroupUsed(pc); -	/* -	 * pc->mem_cgroup is not cleared here. It will be accessed when it's -	 * freed from LRU. This is safe because uncharged page is expected not -	 * to be reused (freed soon). Exception is SwapCache, it's handled by -	 * special functions. -	 */ - -	unlock_page_cgroup(pc); -	/* -	 * even after unlock, we have memcg->res.usage here and this memcg -	 * will never be freed, so it's safe to call css_get(). -	 */ -	memcg_check_events(memcg, page); -	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { -		mem_cgroup_swap_statistics(memcg, true); -		css_get(&memcg->css); -	} -	/* -	 * Migration does not charge the res_counter for the -	 * replacement page, so leave it alone when phasing out the -	 * page that is unused after the migration. -	 */ -	if (!end_migration && !mem_cgroup_is_root(memcg)) -		mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - -	return memcg; - -unlock_out: -	unlock_page_cgroup(pc); -	return NULL; -} - -void mem_cgroup_uncharge_page(struct page *page) -{ -	/* early check. */ -	if (page_mapped(page)) -		return; -	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); -	/* -	 * If the page is in swap cache, uncharge should be deferred -	 * to the swap path, which also properly accounts swap usage -	 * and handles memcg lifetime. 
-	 * -	 * Note that this check is not stable and reclaim may add the -	 * page to swap cache at any time after this.  However, if the -	 * page is not in swap cache by the time page->mapcount hits -	 * 0, there won't be any page table references to the swap -	 * slot, and reclaim will free it and not actually write the -	 * page to disk. -	 */ -	if (PageSwapCache(page)) -		return; -	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -} - -void mem_cgroup_uncharge_cache_page(struct page *page) -{ -	VM_BUG_ON_PAGE(page_mapped(page), page); -	VM_BUG_ON_PAGE(page->mapping, page); -	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); -} - -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ -	current->memcg_batch.do_batch++; -	/* We can do nest. */ -	if (current->memcg_batch.do_batch == 1) { -		current->memcg_batch.memcg = NULL; -		current->memcg_batch.nr_pages = 0; -		current->memcg_batch.memsw_nr_pages = 0; -	} -} - -void mem_cgroup_uncharge_end(void) -{ -	struct memcg_batch_info *batch = ¤t->memcg_batch; - -	if (!batch->do_batch) -		return; - -	batch->do_batch--; -	if (batch->do_batch) /* If stacked, do nothing. */ -		return; - -	if (!batch->memcg) -		return; -	/* -	 * This "batch->memcg" is valid without any css_get/put etc... -	 * bacause we hide charges behind us. -	 */ -	if (batch->nr_pages) -		res_counter_uncharge(&batch->memcg->res, -				     batch->nr_pages * PAGE_SIZE); -	if (batch->memsw_nr_pages) -		res_counter_uncharge(&batch->memcg->memsw, -				     batch->memsw_nr_pages * PAGE_SIZE); -	memcg_oom_recover(batch->memcg); -	/* forget this pointer (for sanity check) */ -	batch->memcg = NULL; -} - -#ifdef CONFIG_SWAP -/* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ -void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ -	struct mem_cgroup *memcg; -	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - -	if (!swapout) /* this was a swap cache but the swap is unused ! */ -		ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - -	memcg = __mem_cgroup_uncharge_common(page, ctype, false); - -	/* -	 * record memcg information,  if swapout && memcg != NULL, -	 * css_get() was called in uncharge(). -	 */ -	if (do_swap_account && swapout && memcg) -		swap_cgroup_record(ent, mem_cgroup_id(memcg)); -} -#endif -  #ifdef CONFIG_MEMCG_SWAP -/* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ -void mem_cgroup_uncharge_swap(swp_entry_t ent) +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, +					 bool charge)  { -	struct mem_cgroup *memcg; -	unsigned short id; - -	if (!do_swap_account) -		return; - -	id = swap_cgroup_record(ent, 0); -	rcu_read_lock(); -	memcg = mem_cgroup_lookup(id); -	if (memcg) { -		/* -		 * We uncharge this because swap is freed.  This memcg can -		 * be obsolete one. We avoid calling css_tryget_online(). -		 */ -		if (!mem_cgroup_is_root(memcg)) -			res_counter_uncharge(&memcg->memsw, PAGE_SIZE); -		mem_cgroup_swap_statistics(memcg, false); -		css_put(&memcg->css); -	} -	rcu_read_unlock(); +	int val = (charge) ? 
1 : -1; +	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);  }  /** @@ -4183,175 +3648,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,  }  #endif -/* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ -void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, -				  struct mem_cgroup **memcgp) -{ -	struct mem_cgroup *memcg = NULL; -	unsigned int nr_pages = 1; -	struct page_cgroup *pc; -	enum charge_type ctype; - -	*memcgp = NULL; - -	if (mem_cgroup_disabled()) -		return; - -	if (PageTransHuge(page)) -		nr_pages <<= compound_order(page); - -	pc = lookup_page_cgroup(page); -	lock_page_cgroup(pc); -	if (PageCgroupUsed(pc)) { -		memcg = pc->mem_cgroup; -		css_get(&memcg->css); -		/* -		 * At migrating an anonymous page, its mapcount goes down -		 * to 0 and uncharge() will be called. But, even if it's fully -		 * unmapped, migration may fail and this page has to be -		 * charged again. We set MIGRATION flag here and delay uncharge -		 * until end_migration() is called -		 * -		 * Corner Case Thinking -		 * A) -		 * When the old page was mapped as Anon and it's unmap-and-freed -		 * while migration was ongoing. -		 * If unmap finds the old page, uncharge() of it will be delayed -		 * until end_migration(). If unmap finds a new page, it's -		 * uncharged when it make mapcount to be 1->0. If unmap code -		 * finds swap_migration_entry, the new page will not be mapped -		 * and end_migration() will find it(mapcount==0). -		 * -		 * B) -		 * When the old page was mapped but migraion fails, the kernel -		 * remaps it. A charge for it is kept by MIGRATION flag even -		 * if mapcount goes down to 0. We can do remap successfully -		 * without charging it again. -		 * -		 * C) -		 * The "old" page is under lock_page() until the end of -		 * migration, so, the old page itself will not be swapped-out. -		 * If the new page is swapped out before end_migraton, our -		 * hook to usual swap-out path will catch the event. -		 */ -		if (PageAnon(page)) -			SetPageCgroupMigration(pc); -	} -	unlock_page_cgroup(pc); -	/* -	 * If the page is not charged at this point, -	 * we return here. -	 */ -	if (!memcg) -		return; - -	*memcgp = memcg; -	/* -	 * We charge new page before it's used/mapped. So, even if unlock_page() -	 * is called before end_migration, we can catch all events on this new -	 * page. In the case new page is migrated but not remapped, new page's -	 * mapcount will be finally 0 and we call uncharge in end_migration(). -	 */ -	if (PageAnon(page)) -		ctype = MEM_CGROUP_CHARGE_TYPE_ANON; -	else -		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; -	/* -	 * The page is committed to the memcg, but it's not actually -	 * charged to the res_counter since we plan on replacing the -	 * old one and only one page is going to be left afterwards. -	 */ -	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); -} - -/* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *memcg, -	struct page *oldpage, struct page *newpage, bool migration_ok) -{ -	struct page *used, *unused; -	struct page_cgroup *pc; -	bool anon; - -	if (!memcg) -		return; - -	if (!migration_ok) { -		used = oldpage; -		unused = newpage; -	} else { -		used = newpage; -		unused = oldpage; -	} -	anon = PageAnon(used); -	__mem_cgroup_uncharge_common(unused, -				     anon ? 
MEM_CGROUP_CHARGE_TYPE_ANON -				     : MEM_CGROUP_CHARGE_TYPE_CACHE, -				     true); -	css_put(&memcg->css); -	/* -	 * We disallowed uncharge of pages under migration because mapcount -	 * of the page goes down to zero, temporarly. -	 * Clear the flag and check the page should be charged. -	 */ -	pc = lookup_page_cgroup(oldpage); -	lock_page_cgroup(pc); -	ClearPageCgroupMigration(pc); -	unlock_page_cgroup(pc); - -	/* -	 * If a page is a file cache, radix-tree replacement is very atomic -	 * and we can skip this check. When it was an Anon page, its mapcount -	 * goes down to 0. But because we added MIGRATION flage, it's not -	 * uncharged yet. There are several case but page->mapcount check -	 * and USED bit check in mem_cgroup_uncharge_page() will do enough -	 * check. (see prepare_charge() also) -	 */ -	if (anon) -		mem_cgroup_uncharge_page(used); -} - -/* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ -void mem_cgroup_replace_page_cache(struct page *oldpage, -				  struct page *newpage) -{ -	struct mem_cgroup *memcg = NULL; -	struct page_cgroup *pc; -	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - -	if (mem_cgroup_disabled()) -		return; - -	pc = lookup_page_cgroup(oldpage); -	/* fix accounting on old pages */ -	lock_page_cgroup(pc); -	if (PageCgroupUsed(pc)) { -		memcg = pc->mem_cgroup; -		mem_cgroup_charge_statistics(memcg, oldpage, false, -1); -		ClearPageCgroupUsed(pc); -	} -	unlock_page_cgroup(pc); - -	/* -	 * When called from shmem_replace_page(), in some cases the -	 * oldpage has already been charged, and in some cases not. -	 */ -	if (!memcg) -		return; -	/* -	 * Even if newpage->mapping was NULL before starting replacement, -	 * the newpage may be on LRU(or pagevec for LRU) already. We lock -	 * LRU while we overwrite pc->mem_cgroup. 
-	 */ -	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true); -} -  #ifdef CONFIG_DEBUG_VM  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)  { @@ -4550,7 +3846,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  						    gfp_mask, &nr_scanned);  		nr_reclaimed += reclaimed;  		*total_scanned += nr_scanned; -		spin_lock(&mctz->lock); +		spin_lock_irq(&mctz->lock);  		/*  		 * If we failed to reclaim anything from this memory cgroup @@ -4590,7 +3886,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  		 */  		/* If excess == 0, no tree ops */  		__mem_cgroup_insert_exceeded(mz, mctz, excess); -		spin_unlock(&mctz->lock); +		spin_unlock_irq(&mctz->lock);  		css_put(&mz->memcg->css);  		loop++;  		/* @@ -4817,7 +4113,6 @@ out:  	return retval;  } -  static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,  					       enum mem_cgroup_stat_index idx)  { @@ -4857,38 +4152,29 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)  	return val << PAGE_SHIFT;  } +  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, -				   struct cftype *cft) +			       struct cftype *cft)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	u64 val; -	int name; -	enum res_type type; - -	type = MEMFILE_TYPE(cft->private); -	name = MEMFILE_ATTR(cft->private); +	enum res_type type = MEMFILE_TYPE(cft->private); +	int name = MEMFILE_ATTR(cft->private);  	switch (type) {  	case _MEM:  		if (name == RES_USAGE) -			val = mem_cgroup_usage(memcg, false); -		else -			val = res_counter_read_u64(&memcg->res, name); -		break; +			return mem_cgroup_usage(memcg, false); +		return res_counter_read_u64(&memcg->res, name);  	case _MEMSWAP:  		if (name == RES_USAGE) -			val = mem_cgroup_usage(memcg, true); -		else -			val = res_counter_read_u64(&memcg->memsw, name); -		break; +			return mem_cgroup_usage(memcg, true); +		return res_counter_read_u64(&memcg->memsw, name);  	case _KMEM: -		val = res_counter_read_u64(&memcg->kmem, name); +		return res_counter_read_u64(&memcg->kmem, name);  		break;  	default:  		BUG();  	} - -	return val;  }  #ifdef CONFIG_MEMCG_KMEM @@ -5446,15 +4732,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,  	mutex_lock(&memcg->thresholds_lock); -	if (type == _MEM) +	if (type == _MEM) {  		thresholds = &memcg->thresholds; -	else if (type == _MEMSWAP) +		usage = mem_cgroup_usage(memcg, false); +	} else if (type == _MEMSWAP) {  		thresholds = &memcg->memsw_thresholds; -	else +		usage = mem_cgroup_usage(memcg, true); +	} else  		BUG(); -	usage = mem_cgroup_usage(memcg, type == _MEMSWAP); -  	/* Check if a threshold crossed before adding a new one */  	if (thresholds->primary)  		__mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5534,18 +4820,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,  	int i, j, size;  	mutex_lock(&memcg->thresholds_lock); -	if (type == _MEM) + +	if (type == _MEM) {  		thresholds = &memcg->thresholds; -	else if (type == _MEMSWAP) +		usage = mem_cgroup_usage(memcg, false); +	} else if (type == _MEMSWAP) {  		thresholds = &memcg->memsw_thresholds; -	else +		usage = mem_cgroup_usage(memcg, true); +	} else  		BUG();  	if (!thresholds->primary)  		goto unlock; -	usage = mem_cgroup_usage(memcg, type == _MEMSWAP); -  	/* Check if a threshold crossed before removing */  	__mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6007,7 +5294,6 @@ static struct cftype mem_cgroup_files[] = {  	},  	{  		.name = 
"use_hierarchy", -		.flags = CFTYPE_INSANE,  		.write_u64 = mem_cgroup_hierarchy_write,  		.read_u64 = mem_cgroup_hierarchy_read,  	}, @@ -6277,6 +5563,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); +	int ret;  	if (css->id > MEM_CGROUP_ID_MAX)  		return -ENOSPC; @@ -6313,7 +5600,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)  	}  	mutex_unlock(&memcg_create_mutex); -	return memcg_init_kmem(memcg, &memory_cgrp_subsys); +	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); +	if (ret) +		return ret; + +	/* +	 * Make sure the memcg is initialized: mem_cgroup_iter() +	 * orders reading memcg->initialized against its callers +	 * reading the memcg members. +	 */ +	smp_store_release(&memcg->initialized, 1); + +	return 0;  }  /* @@ -6411,57 +5709,63 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)  	__mem_cgroup_free(memcg);  } +/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css.  This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency.  The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(css); + +	mem_cgroup_resize_limit(memcg, ULLONG_MAX); +	mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); +	memcg_update_kmem_limit(memcg, ULLONG_MAX); +	res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} +  #ifdef CONFIG_MMU  /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE	256  static int mem_cgroup_do_precharge(unsigned long count)  { -	int ret = 0; -	int batch_count = PRECHARGE_COUNT_AT_ONCE; -	struct mem_cgroup *memcg = mc.to; +	int ret; -	if (mem_cgroup_is_root(memcg)) { +	/* Try a single bulk charge without reclaim first */ +	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); +	if (!ret) {  		mc.precharge += count; -		/* we don't need css_get for root */  		return ret;  	} -	/* try to charge at once */ -	if (count > 1) { -		struct res_counter *dummy; -		/* -		 * "memcg" cannot be under rmdir() because we've already checked -		 * by cgroup_lock_live_cgroup() that it is not removed and we -		 * are still under the same cgroup_mutex. So we can postpone -		 * css_get(). 
-		 */ -		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) -			goto one_by_one; -		if (do_swap_account && res_counter_charge(&memcg->memsw, -						PAGE_SIZE * count, &dummy)) { -			res_counter_uncharge(&memcg->res, PAGE_SIZE * count); -			goto one_by_one; -		} -		mc.precharge += count; +	if (ret == -EINTR) { +		cancel_charge(root_mem_cgroup, count);  		return ret;  	} -one_by_one: -	/* fall back to one by one charge */ + +	/* Try charges one by one with reclaim */  	while (count--) { -		if (signal_pending(current)) { -			ret = -EINTR; -			break; -		} -		if (!batch_count--) { -			batch_count = PRECHARGE_COUNT_AT_ONCE; -			cond_resched(); -		} -		ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); +		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); +		/* +		 * In case of failure, any residual charges against +		 * mc.to will be dropped by mem_cgroup_clear_mc() +		 * later on.  However, cancel any charges that are +		 * bypassed to root right away or they'll be lost. +		 */ +		if (ret == -EINTR) +			cancel_charge(root_mem_cgroup, 1);  		if (ret) -			/* mem_cgroup_clear_mc() will do uncharge later */  			return ret;  		mc.precharge++; +		cond_resched();  	} -	return ret; +	return 0;  }  /** @@ -6597,9 +5901,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,  	if (page) {  		pc = lookup_page_cgroup(page);  		/* -		 * Do only loose check w/o page_cgroup lock. -		 * mem_cgroup_move_account() checks the pc is valid or not under -		 * the lock. +		 * Do only loose check w/o serialization. +		 * mem_cgroup_move_account() checks the pc is valid or +		 * not under LRU exclusion.  		 */  		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {  			ret = MC_TARGET_PAGE; @@ -6724,7 +6028,7 @@ static void __mem_cgroup_clear_mc(void)  	/* we must uncharge all the leftover precharges from mc.to */  	if (mc.precharge) { -		__mem_cgroup_cancel_charge(mc.to, mc.precharge); +		cancel_charge(mc.to, mc.precharge);  		mc.precharge = 0;  	}  	/* @@ -6732,7 +6036,7 @@ static void __mem_cgroup_clear_mc(void)  	 * we must uncharge here.  	 */  	if (mc.moved_charge) { -		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge); +		cancel_charge(mc.from, mc.moved_charge);  		mc.moved_charge = 0;  	}  	/* we must fixup refcnts and charges */ @@ -6740,19 +6044,18 @@ static void __mem_cgroup_clear_mc(void)  		/* uncharge swap account from the old cgroup */  		if (!mem_cgroup_is_root(mc.from))  			res_counter_uncharge(&mc.from->memsw, -						PAGE_SIZE * mc.moved_swap); +					     PAGE_SIZE * mc.moved_swap);  		for (i = 0; i < mc.moved_swap; i++)  			css_put(&mc.from->css); -		if (!mem_cgroup_is_root(mc.to)) { -			/* -			 * we charged both to->res and to->memsw, so we should -			 * uncharge to->res. -			 */ +		/* +		 * we charged both to->res and to->memsw, so we should +		 * uncharge to->res. +		 */ +		if (!mem_cgroup_is_root(mc.to))  			res_counter_uncharge(&mc.to->res, -						PAGE_SIZE * mc.moved_swap); -		} +					     PAGE_SIZE * mc.moved_swap);  		/* we've already done css_get(mc.to) */  		mc.moved_swap = 0;  	} @@ -7005,16 +6308,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,  /*   * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt.   */  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)  {  	/* -	 * use_hierarchy is forced with sane_behavior.  
cgroup core +	 * use_hierarchy is forced on the default hierarchy.  cgroup core  	 * guarantees that @root doesn't have any children, so turning it  	 * on for the root memcg is enough.  	 */ -	if (cgroup_sane_behavior(root_css->cgroup)) +	if (cgroup_on_dfl(root_css->cgroup))  		mem_cgroup_from_css(root_css)->use_hierarchy = true;  } @@ -7023,11 +6327,12 @@ struct cgroup_subsys memory_cgrp_subsys = {  	.css_online = mem_cgroup_css_online,  	.css_offline = mem_cgroup_css_offline,  	.css_free = mem_cgroup_css_free, +	.css_reset = mem_cgroup_css_reset,  	.can_attach = mem_cgroup_can_attach,  	.cancel_attach = mem_cgroup_cancel_attach,  	.attach = mem_cgroup_move_task,  	.bind = mem_cgroup_bind, -	.base_cftypes = mem_cgroup_files, +	.legacy_cftypes = mem_cgroup_files,  	.early_init = 0,  }; @@ -7044,7 +6349,8 @@ __setup("swapaccount=", enable_swap_account);  static void __init memsw_file_init(void)  { -	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); +	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, +					  memsw_cgroup_files));  }  static void __init enable_swap_cgroup(void) @@ -7061,6 +6367,402 @@ static void __init enable_swap_cgroup(void)  }  #endif +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ +	struct page_cgroup *pc; +	unsigned short oldid; + +	VM_BUG_ON_PAGE(PageLRU(page), page); +	VM_BUG_ON_PAGE(page_count(page), page); + +	if (!do_swap_account) +		return; + +	pc = lookup_page_cgroup(page); + +	/* Readahead page, never charged */ +	if (!PageCgroupUsed(pc)) +		return; + +	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + +	oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); +	VM_BUG_ON_PAGE(oldid, page); + +	pc->flags &= ~PCG_MEMSW; +	css_get(&pc->mem_cgroup->css); +	mem_cgroup_swap_statistics(pc->mem_cgroup, true); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ +	struct mem_cgroup *memcg; +	unsigned short id; + +	if (!do_swap_account) +		return; + +	id = swap_cgroup_record(entry, 0); +	rcu_read_lock(); +	memcg = mem_cgroup_lookup(id); +	if (memcg) { +		if (!mem_cgroup_is_root(memcg)) +			res_counter_uncharge(&memcg->memsw, PAGE_SIZE); +		mem_cgroup_swap_statistics(memcg, false); +		css_put(&memcg->css); +	} +	rcu_read_unlock(); +} +#endif + +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge().  Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. 
+ */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, +			  gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ +	struct mem_cgroup *memcg = NULL; +	unsigned int nr_pages = 1; +	int ret = 0; + +	if (mem_cgroup_disabled()) +		goto out; + +	if (PageSwapCache(page)) { +		struct page_cgroup *pc = lookup_page_cgroup(page); +		/* +		 * Every swap fault against a single page tries to charge the +		 * page, bail as early as possible.  shmem_unuse() encounters +		 * already charged pages, too.  The USED bit is protected by +		 * the page lock, which serializes swap cache removal, which +		 * in turn serializes uncharging. +		 */ +		if (PageCgroupUsed(pc)) +			goto out; +	} + +	if (PageTransHuge(page)) { +		nr_pages <<= compound_order(page); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page); +	} + +	if (do_swap_account && PageSwapCache(page)) +		memcg = try_get_mem_cgroup_from_page(page); +	if (!memcg) +		memcg = get_mem_cgroup_from_mm(mm); + +	ret = try_charge(memcg, gfp_mask, nr_pages); + +	css_put(&memcg->css); + +	if (ret == -EINTR) { +		memcg = root_mem_cgroup; +		ret = 0; +	} +out: +	*memcgp = memcg; +	return ret; +} + +/** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up.  This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration.  If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, +			      bool lrucare) +{ +	unsigned int nr_pages = 1; + +	VM_BUG_ON_PAGE(!page->mapping, page); +	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + +	if (mem_cgroup_disabled()) +		return; +	/* +	 * Swap faults will attempt to charge the same page multiple +	 * times.  But reuse_swap_page() might have removed the page +	 * from swapcache already, so we can't check PageSwapCache(). +	 */ +	if (!memcg) +		return; + +	commit_charge(page, memcg, lrucare); + +	if (PageTransHuge(page)) { +		nr_pages <<= compound_order(page); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page); +	} + +	local_irq_disable(); +	mem_cgroup_charge_statistics(memcg, page, nr_pages); +	memcg_check_events(memcg, page); +	local_irq_enable(); + +	if (do_swap_account && PageSwapCache(page)) { +		swp_entry_t entry = { .val = page_private(page) }; +		/* +		 * The swap entry might not get freed for a long time, +		 * let's not wait for it.  The page already received a +		 * memory+swap charge, drop the swap entry duplicate. +		 */ +		mem_cgroup_uncharge_swap(entry); +	} +} + +/** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +{ +	unsigned int nr_pages = 1; + +	if (mem_cgroup_disabled()) +		return; +	/* +	 * Swap faults will attempt to charge the same page multiple +	 * times.  But reuse_swap_page() might have removed the page +	 * from swapcache already, so we can't check PageSwapCache(). 
+	 */ +	if (!memcg) +		return; + +	if (PageTransHuge(page)) { +		nr_pages <<= compound_order(page); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page); +	} + +	cancel_charge(memcg, nr_pages); +} + +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, +			   unsigned long nr_mem, unsigned long nr_memsw, +			   unsigned long nr_anon, unsigned long nr_file, +			   unsigned long nr_huge, struct page *dummy_page) +{ +	unsigned long flags; + +	if (!mem_cgroup_is_root(memcg)) { +		if (nr_mem) +			res_counter_uncharge(&memcg->res, +					     nr_mem * PAGE_SIZE); +		if (nr_memsw) +			res_counter_uncharge(&memcg->memsw, +					     nr_memsw * PAGE_SIZE); +		memcg_oom_recover(memcg); +	} + +	local_irq_save(flags); +	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); +	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); +	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); +	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); +	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); +	memcg_check_events(memcg, dummy_page); +	local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ +	struct mem_cgroup *memcg = NULL; +	unsigned long nr_memsw = 0; +	unsigned long nr_anon = 0; +	unsigned long nr_file = 0; +	unsigned long nr_huge = 0; +	unsigned long pgpgout = 0; +	unsigned long nr_mem = 0; +	struct list_head *next; +	struct page *page; + +	next = page_list->next; +	do { +		unsigned int nr_pages = 1; +		struct page_cgroup *pc; + +		page = list_entry(next, struct page, lru); +		next = page->lru.next; + +		VM_BUG_ON_PAGE(PageLRU(page), page); +		VM_BUG_ON_PAGE(page_count(page), page); + +		pc = lookup_page_cgroup(page); +		if (!PageCgroupUsed(pc)) +			continue; + +		/* +		 * Nobody should be changing or seriously looking at +		 * pc->mem_cgroup and pc->flags at this point, we have +		 * fully exclusive access to the page. +		 */ + +		if (memcg != pc->mem_cgroup) { +			if (memcg) { +				uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, +					       nr_anon, nr_file, nr_huge, page); +				pgpgout = nr_mem = nr_memsw = 0; +				nr_anon = nr_file = nr_huge = 0; +			} +			memcg = pc->mem_cgroup; +		} + +		if (PageTransHuge(page)) { +			nr_pages <<= compound_order(page); +			VM_BUG_ON_PAGE(!PageTransHuge(page), page); +			nr_huge += nr_pages; +		} + +		if (PageAnon(page)) +			nr_anon += nr_pages; +		else +			nr_file += nr_pages; + +		if (pc->flags & PCG_MEM) +			nr_mem += nr_pages; +		if (pc->flags & PCG_MEMSW) +			nr_memsw += nr_pages; +		pc->flags = 0; + +		pgpgout++; +	} while (next != page_list); + +	if (memcg) +		uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, +			       nr_anon, nr_file, nr_huge, page); +} + +/** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge(struct page *page) +{ +	struct page_cgroup *pc; + +	if (mem_cgroup_disabled()) +		return; + +	/* Don't touch page->lru of any random page, pre-check: */ +	pc = lookup_page_cgroup(page); +	if (!PageCgroupUsed(pc)) +		return; + +	INIT_LIST_HEAD(&page->lru); +	uncharge_list(&page->lru); +} + +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
+ */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ +	if (mem_cgroup_disabled()) +		return; + +	if (!list_empty(page_list)) +		uncharge_list(page_list); +} + +/** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, +			bool lrucare) +{ +	struct page_cgroup *pc; +	int isolated; + +	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); +	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); +	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); +	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); +	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); +	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), +		       newpage); + +	if (mem_cgroup_disabled()) +		return; + +	/* Page cache replacement: new page already charged? */ +	pc = lookup_page_cgroup(newpage); +	if (PageCgroupUsed(pc)) +		return; + +	/* Re-entrant migration: old page already uncharged? */ +	pc = lookup_page_cgroup(oldpage); +	if (!PageCgroupUsed(pc)) +		return; + +	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); +	VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + +	if (lrucare) +		lock_page_lru(oldpage, &isolated); + +	pc->flags = 0; + +	if (lrucare) +		unlock_page_lru(oldpage, isolated); + +	commit_charge(newpage, pc->mem_cgroup, lrucare); +} +  /*   * subsys_initcall() for memory controller.   *  |
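The iterator change above pairs smp_load_acquire() in mem_cgroup_iter() with smp_store_release() in mem_cgroup_css_online(). A condensed, self-contained sketch of that publication pattern; the example_* names are illustrative stand-ins, not code from the patch:

	/*
	 * Publication pattern behind memcg->initialized, reduced to
	 * its essentials.
	 */
	struct example_memcg {
		/* ...members set up during css_online()... */
		int initialized;
	};

	static void example_online(struct example_memcg *memcg)
	{
		example_init_members(memcg);	/* hypothetical setup step */
		/*
		 * Release: every store above is ordered before the
		 * flag becomes visible to an acquire reader.
		 */
		smp_store_release(&memcg->initialized, 1);
	}

	static struct example_memcg *example_try_iter(struct example_memcg *memcg)
	{
		/*
		 * Acquire: observing initialized == 1 guarantees the
		 * member initialization is visible, too.  Otherwise
		 * the iterator drops its reference and skips the group.
		 */
		if (smp_load_acquire(&memcg->initialized))
			return memcg;
		return NULL;
	}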
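try_charge() now folds the old mem_cgroup_do_charge() state machine into one function and overcharges by up to CHARGE_BATCH pages, parking the surplus in the per-cpu stock. A minimal sketch of that control flow, with swap accounting and the reclaim/OOM slowpath collapsed into a hypothetical example_reclaim_and_retry():

	static int example_try_charge_flow(struct mem_cgroup *memcg,
					   gfp_t gfp_mask,
					   unsigned int nr_pages)
	{
		unsigned int batch = max(CHARGE_BATCH, nr_pages);
		struct res_counter *fail_res;

		if (consume_stock(memcg, nr_pages))	/* per-cpu fastpath */
			return 0;
	retry:
		/* Overcharge up to CHARGE_BATCH to amortize res_counter cost */
		if (!res_counter_charge(&memcg->res, batch * PAGE_SIZE, &fail_res)) {
			if (batch > nr_pages)		/* cache the surplus */
				refill_stock(memcg, batch - nr_pages);
			return 0;
		}

		/* The batch didn't fit: retry with the exact request */
		if (batch > nr_pages) {
			batch = nr_pages;
			goto retry;
		}

		/* hypothetical stand-in for the reclaim/OOM slowpath */
		return example_reclaim_and_retry(memcg, gfp_mask, nr_pages);
	}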
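The per-type entry points (mem_cgroup_charge_anon(), mem_cgroup_charge_file(), and the swapin try/commit/cancel variants) are gone; callers use one transactional API. A sketch of the expected calling convention, where example_insert() is a hypothetical stand-in for the actual instantiation step (radix-tree insert, page table setup, and so on):

	static int example_instantiate_page(struct page *page,
					    struct mm_struct *mm,
					    gfp_t gfp_mask)
	{
		struct mem_cgroup *memcg;
		int error;

		/* Reserve the charges, reclaiming if necessary */
		error = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
		if (error)
			return error;

		error = example_insert(page);		/* hypothetical */
		if (error) {
			/* Instantiation failed: return the reservation */
			mem_cgroup_cancel_charge(page, memcg);
			return error;
		}

		/*
		 * page->mapping is now set up; make the charge visible.
		 * Pass lrucare == true if the page may already sit on
		 * an LRU list (e.g. swap cache), so commit_charge()
		 * takes zone->lru_lock around rewriting pc->mem_cgroup.
		 */
		mem_cgroup_commit_charge(page, memcg, PageSwapCache(page));
		return 0;
	}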
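On the uncharge side, pages are only uncharged once their reference count hits zero, which allows batching over whole lists instead of the old uncharge_start()/uncharge_end() bracketing. A sketch of a bulk free path; free_hot_cold_page_list() is assumed from the contemporary page allocator, and the pages are assumed to be off-LRU with zero references:

	static void example_free_pages_bulk(struct list_head *pages)
	{
		/*
		 * uncharge_list() coalesces res_counter and statistics
		 * updates for runs of pages in the same memcg, so one
		 * res_counter operation can cover many pages.
		 */
		mem_cgroup_uncharge_list(pages);
		free_hot_cold_page_list(pages, false);	/* assumed helper */
	}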
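Finally, migration: the mem_cgroup_prepare_migration()/mem_cgroup_end_migration() pair and the PCG_MIGRATION flag are replaced by a single call that moves the commit from the old page to the new one. A sketch of the page cache replacement case, where both pages are locked by the caller and the new page may already be on the LRU:

	static void example_replace_page_cache(struct page *oldpage,
					       struct page *newpage)
	{
		/* ...swap the radix-tree slot, details elided... */
		mem_cgroup_migrate(oldpage, newpage, true);
	}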