Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	569
1 file changed, 264 insertions(+), 305 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6da5020a8656..508bcea7df56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
 	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
 
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
@@ -239,7 +234,7 @@ enum res_type {
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -456,28 +451,6 @@ ino_t page_cgroup_ino(struct page *page)
 	return ino;
 }
 
-static struct mem_cgroup_per_node *
-mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
-{
-	int nid = page_to_nid(page);
-
-	return memcg->nodeinfo[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_node(int nid)
-{
-	return soft_limit_tree.rb_tree_per_node[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_from_page(struct page *page)
-{
-	int nid = page_to_nid(page);
-
-	return soft_limit_tree.rb_tree_per_node[nid];
-}
-
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 					 struct mem_cgroup_tree_per_node *mctz,
 					 unsigned long new_usage_in_excess)
@@ -548,13 +521,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 	return excess;
 }
 
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
 {
 	unsigned long excess;
 	struct mem_cgroup_per_node *mz;
 	struct mem_cgroup_tree_per_node *mctz;
 
-	mctz = soft_limit_tree_from_page(page);
+	mctz = soft_limit_tree.rb_tree_per_node[nid];
 	if (!mctz)
 		return;
 	/*
@@ -562,7 +535,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 	 * because their event counter is not touched.
 	 */
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = mem_cgroup_page_nodeinfo(memcg, page);
+		mz = memcg->nodeinfo[nid];
 		excess = soft_limit_excess(memcg);
 		/*
 		 * We have to update the tree if mz is on RB-tree or
@@ -593,7 +566,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 
 	for_each_node(nid) {
 		mz = memcg->nodeinfo[nid];
-		mctz = soft_limit_tree_node(nid);
+		mctz = soft_limit_tree.rb_tree_per_node[nid];
 		if (mctz)
 			mem_cgroup_remove_exceeded(mz, mctz);
 	}
@@ -635,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats update or read are performance sensitive and
+ * adding stats flushing in such codepaths is not desirable. So, to optimize the
+ * flushing the kernel does:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
+ *    rstat update tree grow unbounded.
+ *
+ * 2) Flush the stats synchronously on reader side only when there are more than
+ *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
+ *    will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
+ *    only for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
+{
+	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+	if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
+		atomic_inc(&stats_flush_threshold);
+}
+
+static void __mem_cgroup_flush_stats(void)
+{
+	unsigned long flag;
+
+	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+		return;
+
+	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+	atomic_set(&stats_flush_threshold, 0);
+	spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+		__mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+	mem_cgroup_flush_stats();
+	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -647,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 		return;
 
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+	memcg_rstat_updated(memcg);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -675,10 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	memcg = pn->memcg;
 
 	/* Update memcg */
-	__mod_memcg_state(memcg, idx, val);
+	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
 	/* Update lruvec */
 	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+	memcg_rstat_updated(memcg);
 }
 
 /**
@@ -780,7 +807,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 		return;
 
 	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
-	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+	memcg_rstat_updated(memcg);
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -799,7 +826,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
-					 struct page *page,
 					 int nr_pages)
 {
 	/* pagein of a big page is an event. So, ignore page size */
@@ -842,7 +868,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  * Check events in order.
  *
  */
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
 {
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
@@ -853,7 +879,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 						MEM_CGROUP_TARGET_SOFTLIMIT);
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
-			mem_cgroup_update_tree(memcg, page);
+			mem_cgroup_update_tree(memcg, nid);
 	}
 }
 
@@ -1149,64 +1175,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
 {
 	struct mem_cgroup *memcg;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	memcg = page_memcg(page);
+	memcg = folio_memcg(folio);
 
 	if (!memcg)
-		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
 	else
-		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
 }
 #endif
 
 /**
- * lock_page_lruvec - lock and return lruvec for a given page.
- * @page: the page
+ * folio_lruvec_lock - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
  *
  * These functions are safe to use under any of the following conditions:
- * - page locked
- * - PageLRU cleared
- * - lock_page_memcg()
- * - page->_refcount is zero
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
  */
-struct lruvec *lock_page_lruvec(struct page *page)
+struct lruvec *folio_lruvec_lock(struct folio *folio)
 {
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = folio_lruvec(folio);
 
-	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock(&lruvec->lru_lock);
-
-	lruvec_memcg_debug(lruvec, page);
+	lruvec_memcg_debug(lruvec, folio);
 
 	return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irq(struct page *page)
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
 {
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = folio_lruvec(folio);
 
-	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock_irq(&lruvec->lru_lock);
-
-	lruvec_memcg_debug(lruvec, page);
+	lruvec_memcg_debug(lruvec, folio);
 
 	return lruvec;
 }
 
-struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+		unsigned long *flags)
 {
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = folio_lruvec(folio);
 
-	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock_irqsave(&lruvec->lru_lock, *flags);
-
-	lruvec_memcg_debug(lruvec, page);
+	lruvec_memcg_debug(lruvec, folio);
 
 	return lruvec;
 }
 
@@ -1414,7 +1464,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 	 *
 	 * Current memory state:
 	 */
-	cgroup_rstat_flush(memcg->css.cgroup);
+	mem_cgroup_flush_stats();
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		u64 size;
@@ -1575,7 +1625,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * A few threads which were not waiting at mutex_lock_killable() can
 	 * fail to bail out. Therefore, check again after holding oom_lock.
 	 */
-	ret = should_force_charge() || out_of_memory(&oc);
+	ret = task_is_dying() || out_of_memory(&oc);
 
 unlock:
 	mutex_unlock(&oom_lock);
@@ -1956,18 +2006,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 }
 
 /**
- * lock_page_memcg - lock a page and memcg binding
- * @page: the page
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
  *
- * This function protects unlocked LRU pages from being moved to
+ * This function prevents unlocked LRU folios from being moved to
  * another cgroup.
  *
- * It ensures lifetime of the locked memcg. Caller is responsible
- * for the lifetime of the page.
+ * It ensures lifetime of the bound memcg.  The caller is responsible
+ * for the lifetime of the folio.
  */
-void lock_page_memcg(struct page *page)
+void folio_memcg_lock(struct folio *folio)
 {
-	struct page *head = compound_head(page); /* rmap on tail pages */
 	struct mem_cgroup *memcg;
 	unsigned long flags;
 
@@ -1981,7 +2030,7 @@ void lock_page_memcg(struct page *page)
 	if (mem_cgroup_disabled())
 		return;
 again:
-	memcg = page_memcg(head);
+	memcg = folio_memcg(folio);
 	if (unlikely(!memcg))
 		return;
 
@@ -1995,7 +2044,7 @@ again:
 		return;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
-	if (memcg != page_memcg(head)) {
+	if (memcg != folio_memcg(folio)) {
 		spin_unlock_irqrestore(&memcg->move_lock, flags);
 		goto again;
 	}
@@ -2009,9 +2058,15 @@ again:
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 }
+EXPORT_SYMBOL(folio_memcg_lock);
+
+void lock_page_memcg(struct page *page)
+{
+	folio_memcg_lock(page_folio(page));
+}
 EXPORT_SYMBOL(lock_page_memcg);
 
-static void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
 {
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
@@ -2026,14 +2081,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg)
 }
 
 /**
- * unlock_page_memcg - unlock a page and memcg binding
- * @page: the page
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock().  This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
  */
-void unlock_page_memcg(struct page *page)
+void folio_memcg_unlock(struct folio *folio)
 {
-	struct page *head = compound_head(page);
-
-	__unlock_page_memcg(page_memcg(head));
+	__folio_memcg_unlock(folio_memcg(folio));
+}
+EXPORT_SYMBOL(folio_memcg_unlock);
+
+void unlock_page_memcg(struct page *page)
+{
+	folio_memcg_unlock(page_folio(page));
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
@@ -2530,6 +2593,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct page_counter *counter;
 	enum oom_status oom_status;
 	unsigned long nr_reclaimed;
+	bool passed_oom = false;
 	bool may_swap = true;
 	bool drained = false;
 	unsigned long pflags;
@@ -2565,15 +2629,6 @@ retry:
 		goto force;
 
 	/*
-	 * Unlike in global OOM situations, memcg is not in a physical
-	 * memory shortage.  Allow dying and OOM-killed tasks to
-	 * bypass the last charges so that they can exit quickly and
-	 * free their memory.
-	 */
-	if (unlikely(should_force_charge()))
-		goto force;
-
-	/*
 	 * Prevent unbounded recursion when reclaim operations need to
 	 * allocate memory. This might exceed the limits temporarily,
 	 * but we prefer facilitating memory reclaim and getting back
@@ -2630,8 +2685,9 @@ retry:
 	if (gfp_mask & __GFP_RETRY_MAYFAIL)
 		goto nomem;
 
-	if (fatal_signal_pending(current))
-		goto force;
+	/* Avoid endless loop for tasks bypassed by the oom killer */
+	if (passed_oom && task_is_dying())
+		goto nomem;
 
 	/*
 	 * keep retrying as long as the memcg oom killer is able to make
@@ -2640,14 +2696,10 @@ retry:
 	 */
 	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
 		       get_order(nr_pages * PAGE_SIZE));
-	switch (oom_status) {
-	case OOM_SUCCESS:
+	if (oom_status == OOM_SUCCESS) {
+		passed_oom = true;
 		nr_retries = MAX_RECLAIM_RETRIES;
 		goto retry;
-	case OOM_FAILED:
-		goto force;
-	default:
-		goto nomem;
 	}
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))
@@ -2722,8 +2774,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	return try_charge_memcg(memcg, gfp_mask, nr_pages);
 }
 
-#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
-static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (mem_cgroup_is_root(memcg))
 		return;
@@ -2732,11 +2783,10 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 	if (do_memsw_account())
 		page_counter_uncharge(&memcg->memsw, nr_pages);
 }
-#endif
 
-static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 {
-	VM_BUG_ON_PAGE(page_memcg(page), page);
+	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
 	/*
 	 * Any of the following ensures page's memcg stability:
 	 *
@@ -2745,7 +2795,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 	 * - lock_page_memcg()
 	 * - exclusive reference
 	 */
-	page->memcg_data = (unsigned long)memcg;
+	folio->memcg_data = (unsigned long)memcg;
 }
 
 static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
@@ -2951,7 +3001,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
 				   unsigned int nr_pages)
 {
-	struct page_counter *counter;
 	struct mem_cgroup *memcg;
 	int ret;
 
@@ -2961,21 +3010,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
 	if (ret)
 		goto out;
 
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
-	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-
-		/*
-		 * Enforce __GFP_NOFAIL allocation because callers are not
-		 * prepared to see failures and likely do not have any failure
-		 * handling code.
-		 */
-		if (gfp & __GFP_NOFAIL) {
-			page_counter_charge(&memcg->kmem, nr_pages);
-			goto out;
-		}
-		cancel_charge(memcg, nr_pages);
-		ret = -ENOMEM;
-	}
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_charge(&memcg->kmem, nr_pages);
 out:
 	css_put(&memcg->css);
@@ -3015,15 +3051,16 @@
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
+	struct folio *folio = page_folio(page);
 	struct obj_cgroup *objcg;
 	unsigned int nr_pages = 1 << order;
 
-	if (!PageMemcgKmem(page))
+	if (!folio_memcg_kmem(folio))
 		return;
 
-	objcg = __page_objcg(page);
+	objcg = __folio_objcg(folio);
 	obj_cgroup_uncharge_pages(objcg, nr_pages);
-	page->memcg_data = 0;
+	folio->memcg_data = 0;
 	obj_cgroup_put(objcg);
 }
@@ -3257,17 +3294,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
  */
 void split_page_memcg(struct page *head, unsigned int nr)
 {
-	struct mem_cgroup *memcg = page_memcg(head);
+	struct folio *folio = page_folio(head);
+	struct mem_cgroup *memcg = folio_memcg(folio);
 	int i;
 
 	if (mem_cgroup_disabled() || !memcg)
 		return;
 
 	for (i = 1; i < nr; i++)
-		head[i].memcg_data = head->memcg_data;
+		folio_page(folio, i)->memcg_data = folio->memcg_data;
 
-	if (PageMemcgKmem(head))
-		obj_cgroup_get_many(__page_objcg(head), nr - 1);
+	if (folio_memcg_kmem(folio))
+		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
 	else
 		css_get_many(&memcg->css, nr - 1);
 }
@@ -3381,7 +3419,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node(pgdat->node_id);
+	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
 
 	/*
 	 * Do not even bother to check the largest node if the root
@@ -3465,19 +3503,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 
 	/* try to free all pages in this cgroup */
 	while (nr_retries && page_counter_read(&memcg->memory)) {
-		int progress;
-
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, 1,
-							GFP_KERNEL, true);
-		if (!progress) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
 			nr_retries--;
-			/* maybe some writeback is necessary */
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
-		}
-
 	}
 
 	return 0;
@@ -3518,8 +3548,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 	unsigned long val;
 
 	if (mem_cgroup_is_root(memcg)) {
-		/* mem_cgroup_threshold() calls here from irqsafe context */
-		cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+		mem_cgroup_flush_stats();
 		val = memcg_page_state(memcg, NR_FILE_PAGES) +
 			memcg_page_state(memcg, NR_ANON_MAPPED);
 		if (swap)
@@ -3594,7 +3623,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 		return 0;
 
 	BUG_ON(memcg->kmemcg_id >= 0);
-	BUG_ON(memcg->kmem_state);
 
 	memcg_id = memcg_alloc_cache_id();
 	if (memcg_id < 0)
@@ -3611,22 +3639,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 	static_branch_enable(&memcg_kmem_enabled_key);
 
 	memcg->kmemcg_id = memcg_id;
-	memcg->kmem_state = KMEM_ONLINE;
 
 	return 0;
 }
 
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
-	struct cgroup_subsys_state *css;
-	struct mem_cgroup *parent, *child;
+	struct mem_cgroup *parent;
 	int kmemcg_id;
 
-	if (memcg->kmem_state != KMEM_ONLINE)
+	if (memcg->kmemcg_id == -1)
 		return;
 
-	memcg->kmem_state = KMEM_ALLOCATED;
-
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
 		parent = root_mem_cgroup;
@@ -3637,31 +3661,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	BUG_ON(kmemcg_id < 0);
 
 	/*
-	 * Change kmemcg_id of this cgroup and all its descendants to the
-	 * parent's id, and then move all entries from this cgroup's list_lrus
-	 * to ones of the parent. After we have finished, all list_lrus
-	 * corresponding to this cgroup are guaranteed to remain empty. The
-	 * ordering is imposed by list_lru_node->lock taken by
+	 * After we have finished memcg_reparent_objcgs(), all list_lrus
+	 * corresponding to this cgroup are guaranteed to remain empty.
+	 * The ordering is imposed by list_lru_node->lock taken by
 	 * memcg_drain_all_list_lrus().
 	 */
-	rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
-	css_for_each_descendant_pre(css, &memcg->css) {
-		child = mem_cgroup_from_css(css);
-		BUG_ON(child->kmemcg_id != kmemcg_id);
-		child->kmemcg_id = parent->kmemcg_id;
-	}
-	rcu_read_unlock();
-
 	memcg_drain_all_list_lrus(kmemcg_id, parent);
 
 	memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-	/* css_alloc() failed, offlining didn't happen */
-	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
-		memcg_offline_kmem(memcg);
+	memcg->kmemcg_id = -1;
 }
 #else
 static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3671,22 +3679,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
 }
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
-static int memcg_update_kmem_max(struct mem_cgroup *memcg,
-				 unsigned long max)
-{
-	int ret;
-
-	mutex_lock(&memcg_max_mutex);
-	ret = page_counter_set_max(&memcg->kmem, max);
-	mutex_unlock(&memcg_max_mutex);
-	return ret;
-}
-
 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
 {
 	int ret;
@@ -3752,10 +3746,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
 			break;
 		case _KMEM:
-			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
-				     "Please report your usecase to [email protected] if you "
-				     "depend on this functionality.\n");
-			ret = memcg_update_kmem_max(memcg, nr_pages);
+			/* kmem.limit_in_bytes is deprecated. */
+			ret = -EOPNOTSUPP;
 			break;
 		case _TCP:
 			ret = memcg_update_tcp_max(memcg, nr_pages);
 			break;
@@ -3900,7 +3892,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	int nid;
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	cgroup_rstat_flush(memcg->css.cgroup);
+	mem_cgroup_flush_stats();
 
 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
 		seq_printf(m, "%s=%lu", stat->name,
@@ -3972,7 +3964,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 
-	cgroup_rstat_flush(memcg->css.cgroup);
+	mem_cgroup_flush_stats();
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		unsigned long nr;
@@ -4475,7 +4467,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+	mem_cgroup_flush_stats();
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -4537,17 +4529,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 * As being wrong occasionally doesn't matter, updates and accesses to the
 * records are lockless and racy.
 */
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 					     struct bdi_writeback *wb)
 {
-	struct mem_cgroup *memcg = page_memcg(page);
+	struct mem_cgroup *memcg = folio_memcg(folio);
 	struct memcg_cgwb_frn *frn;
 	u64 now = get_jiffies_64();
 	u64 oldest_at = now;
 	int oldest = -1;
 	int i;
 
-	trace_track_foreign_dirty(page, wb);
+	trace_track_foreign_dirty(folio, wb);
 
 	/*
 	 * Pick the slot to use.  If there is already a slot for @wb, keep
@@ -5308,7 +5300,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
 	free_shrinker_info(memcg);
-	memcg_free_kmem(memcg);
+
+	/* Need to offline kmem if online_css() fails */
+	memcg_offline_kmem(memcg);
 	mem_cgroup_free(memcg);
 }
 
@@ -5341,21 +5335,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg_wb_domain_size_changed(memcg);
 }
 
-void mem_cgroup_flush_stats(void)
-{
-	if (!spin_trylock(&stats_flush_lock))
-		return;
-
-	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
-	spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
-	mem_cgroup_flush_stats();
-	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5545,7 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 #endif
 
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
-			unsigned long addr, pte_t ptent, swp_entry_t *entry)
+			unsigned long addr, pte_t ptent)
 {
 	if (!vma->vm_file) /* anonymous vma */
 		return NULL;
@@ -5575,38 +5554,39 @@ static int mem_cgroup_move_account(struct page *page,
 				   struct mem_cgroup *from,
 				   struct mem_cgroup *to)
 {
+	struct folio *folio = page_folio(page);
 	struct lruvec *from_vec, *to_vec;
 	struct pglist_data *pgdat;
-	unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
-	int ret;
+	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
+	int nid, ret;
 
 	VM_BUG_ON(from == to);
-	VM_BUG_ON_PAGE(PageLRU(page), page);
-	VM_BUG_ON(compound && !PageTransHuge(page));
+	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+	VM_BUG_ON(compound && !folio_test_multi(folio));
 
 	/*
 	 * Prevent mem_cgroup_migrate() from looking at
 	 * page's memory cgroup of its source page while we change it.
 	 */
 	ret = -EBUSY;
-	if (!trylock_page(page))
+	if (!folio_trylock(folio))
 		goto out;
 
 	ret = -EINVAL;
-	if (page_memcg(page) != from)
+	if (folio_memcg(folio) != from)
 		goto out_unlock;
 
-	pgdat = page_pgdat(page);
+	pgdat = folio_pgdat(folio);
 	from_vec = mem_cgroup_lruvec(from, pgdat);
 	to_vec = mem_cgroup_lruvec(to, pgdat);
 
-	lock_page_memcg(page);
+	folio_memcg_lock(folio);
 
-	if (PageAnon(page)) {
-		if (page_mapped(page)) {
+	if (folio_test_anon(folio)) {
+		if (folio_mapped(folio)) {
 			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
 			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
-			if (PageTransHuge(page)) {
+			if (folio_test_transhuge(folio)) {
 				__mod_lruvec_state(from_vec, NR_ANON_THPS,
 						   -nr_pages);
 				__mod_lruvec_state(to_vec, NR_ANON_THPS,
@@ -5617,18 +5597,18 @@ static int mem_cgroup_move_account(struct page *page,
 		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
 		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
 
-		if (PageSwapBacked(page)) {
+		if (folio_test_swapbacked(folio)) {
 			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
 			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
 		}
 
-		if (page_mapped(page)) {
+		if (folio_mapped(folio)) {
 			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
 			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
 		}
 
-		if (PageDirty(page)) {
-			struct address_space *mapping = page_mapping(page);
+		if (folio_test_dirty(folio)) {
+			struct address_space *mapping = folio_mapping(folio);
 
 			if (mapping_can_writeback(mapping)) {
 				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
@@ -5639,7 +5619,7 @@ static int mem_cgroup_move_account(struct page *page,
 		}
 	}
 
-	if (PageWriteback(page)) {
+	if (folio_test_writeback(folio)) {
 		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
 		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
 	}
@@ -5662,20 +5642,21 @@ static int mem_cgroup_move_account(struct page *page,
 	css_get(&to->css);
 	css_put(&from->css);
 
-	page->memcg_data = (unsigned long)to;
+	folio->memcg_data = (unsigned long)to;
 
-	__unlock_page_memcg(from);
+	__folio_memcg_unlock(from);
 
 	ret = 0;
+	nid = folio_nid(folio);
 
 	local_irq_disable();
-	mem_cgroup_charge_statistics(to, page, nr_pages);
-	memcg_check_events(to, page);
-	mem_cgroup_charge_statistics(from, page, -nr_pages);
-	memcg_check_events(from, page);
+	mem_cgroup_charge_statistics(to, nr_pages);
+	memcg_check_events(to, nid);
+	mem_cgroup_charge_statistics(from, -nr_pages);
+	memcg_check_events(from, nid);
 	local_irq_enable();
 out_unlock:
-	unlock_page(page);
+	folio_unlock(folio);
 out:
 	return ret;
 }
@@ -5718,7 +5699,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, ptent, &ent);
 	else if (pte_none(ptent))
-		page = mc_handle_file_pte(vma, addr, ptent, &ent);
+		page = mc_handle_file_pte(vma, addr, ptent);
 
 	if (!page && !ent.val)
 		return ret;
@@ -6373,7 +6354,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
 	int i;
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	cgroup_rstat_flush(memcg->css.cgroup);
+	mem_cgroup_flush_stats();
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		int nid;
@@ -6680,9 +6661,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 			atomic_long_read(&parent->memory.children_low_usage)));
 }
 
-static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
+static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+			gfp_t gfp)
 {
-	unsigned int nr_pages = thp_nr_pages(page);
+	long nr_pages = folio_nr_pages(folio);
 	int ret;
 
 	ret = try_charge(memcg, gfp, nr_pages);
@@ -6690,38 +6672,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
 		goto out;
 
 	css_get(&memcg->css);
-	commit_charge(page, memcg);
+	commit_charge(folio, memcg);
 
 	local_irq_disable();
-	mem_cgroup_charge_statistics(memcg, page, nr_pages);
-	memcg_check_events(memcg, page);
+	mem_cgroup_charge_statistics(memcg, nr_pages);
+	memcg_check_events(memcg, folio_nid(folio));
 	local_irq_enable();
 out:
 	return ret;
 }
 
-/**
- * __mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
- * @mm: mm context of the victim
- * @gfp_mask: reclaim mode
- *
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary. if @mm is NULL, try to
- * charge to the active memcg.
- *
- * Do not use this for pages allocated for swapin.
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
-			gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	int ret;
 
 	memcg = get_mem_cgroup_from_mm(mm);
-	ret = charge_memcg(page, memcg, gfp_mask);
+	ret = charge_memcg(folio, memcg, gfp);
 	css_put(&memcg->css);
 
 	return ret;
@@ -6742,6 +6709,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry)
 {
+	struct folio *folio = page_folio(page);
 	struct mem_cgroup *memcg;
 	unsigned short id;
 	int ret;
@@ -6756,7 +6724,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 		memcg = get_mem_cgroup_from_mm(mm);
 	rcu_read_unlock();
 
-	ret = charge_memcg(page, memcg, gfp);
+	ret = charge_memcg(folio, memcg, gfp);
 
 	css_put(&memcg->css);
 	return ret;
@@ -6800,7 +6768,7 @@ struct uncharge_gather {
 	unsigned long nr_memory;
 	unsigned long pgpgout;
 	unsigned long nr_kmem;
-	struct page *dummy_page;
+	int nid;
 };
 
 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6824,36 +6792,36 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	local_irq_save(flags);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
 	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-	memcg_check_events(ug->memcg, ug->dummy_page);
+	memcg_check_events(ug->memcg, ug->nid);
 	local_irq_restore(flags);
 
-	/* drop reference from uncharge_page */
+	/* drop reference from uncharge_folio */
 	css_put(&ug->memcg->css);
 }
 
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 {
-	unsigned long nr_pages;
+	long nr_pages;
 	struct mem_cgroup *memcg;
 	struct obj_cgroup *objcg;
-	bool use_objcg = PageMemcgKmem(page);
+	bool use_objcg = folio_memcg_kmem(folio);
 
-	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
 	/*
 	 * Nobody should be changing or seriously looking at
-	 * page memcg or objcg at this point, we have fully
-	 * exclusive access to the page.
+	 * folio memcg or objcg at this point, we have fully
+	 * exclusive access to the folio.
	 */
 	if (use_objcg) {
-		objcg = __page_objcg(page);
+		objcg = __folio_objcg(folio);
 		/*
 		 * This get matches the put at the end of the function and
 		 * kmem pages do not hold memcg references anymore.
 		 */
 		memcg = get_mem_cgroup_from_objcg(objcg);
 	} else {
-		memcg = __page_memcg(page);
+		memcg = __folio_memcg(folio);
 	}
 
 	if (!memcg)
@@ -6865,19 +6833,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 			uncharge_gather_clear(ug);
 		}
 		ug->memcg = memcg;
-		ug->dummy_page = page;
+		ug->nid = folio_nid(folio);
 
 		/* pairs with css_put in uncharge_batch */
 		css_get(&memcg->css);
 	}
 
-	nr_pages = compound_nr(page);
+	nr_pages = folio_nr_pages(folio);
 
 	if (use_objcg) {
 		ug->nr_memory += nr_pages;
 		ug->nr_kmem += nr_pages;
 
-		page->memcg_data = 0;
+		folio->memcg_data = 0;
 		obj_cgroup_put(objcg);
 	} else {
 		/* LRU pages aren't accounted at the root level */
@@ -6885,28 +6853,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 			ug->nr_memory += nr_pages;
 		ug->pgpgout++;
 
-		page->memcg_data = 0;
+		folio->memcg_data = 0;
 	}
 
 	css_put(&memcg->css);
 }
 
-/**
- * __mem_cgroup_uncharge - uncharge a page
- * @page: page to uncharge
- *
- * Uncharge a page previously charged with __mem_cgroup_charge().
- */
-void __mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio)
 {
 	struct uncharge_gather ug;
 
-	/* Don't touch page->lru of any random page, pre-check: */
-	if (!page_memcg(page))
+	/* Don't touch folio->lru of any random page, pre-check: */
+	if (!folio_memcg(folio))
 		return;
 
 	uncharge_gather_clear(&ug);
-	uncharge_page(page, &ug);
+	uncharge_folio(folio, &ug);
 	uncharge_batch(&ug);
 }
 
@@ -6920,52 +6882,49 @@ void __mem_cgroup_uncharge(struct page *page)
 void __mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 	struct uncharge_gather ug;
-	struct page *page;
+	struct folio *folio;
 
 	uncharge_gather_clear(&ug);
-	list_for_each_entry(page, page_list, lru)
-		uncharge_page(page, &ug);
+	list_for_each_entry(folio, page_list, lru)
+		uncharge_folio(folio, &ug);
 	if (ug.memcg)
 		uncharge_batch(&ug);
 }
 
 /**
- * mem_cgroup_migrate - charge a page's replacement
- * @oldpage: currently circulating page
- * @newpage: replacement page
+ * mem_cgroup_migrate - Charge a folio's replacement.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
  *
- * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * Charge @new as a replacement folio for @old. @old will
  * be uncharged upon free.
 *
- * Both pages must be locked, @newpage->mapping must be set up.
+ * Both folios must be locked, @new->mapping must be set up.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 	struct mem_cgroup *memcg;
-	unsigned int nr_pages;
+	long nr_pages = folio_nr_pages(new);
 	unsigned long flags;
 
-	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
-	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
-		       newpage);
+	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
 
 	if (mem_cgroup_disabled())
 		return;
 
-	/* Page cache replacement: new page already charged? */
-	if (page_memcg(newpage))
+	/* Page cache replacement: new folio already charged? */
+	if (folio_memcg(new))
 		return;
 
-	memcg = page_memcg(oldpage);
-	VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
+	memcg = folio_memcg(old);
+	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
 	if (!memcg)
 		return;
 
 	/* Force-charge the new page. The old one will be freed soon */
-	nr_pages = thp_nr_pages(newpage);
-
 	if (!mem_cgroup_is_root(memcg)) {
 		page_counter_charge(&memcg->memory, nr_pages);
 		if (do_memsw_account())
@@ -6973,11 +6932,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 	}
 
 	css_get(&memcg->css);
-	commit_charge(newpage, memcg);
+	commit_charge(new, memcg);
 
 	local_irq_save(flags);
-	mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
-	memcg_check_events(memcg, newpage);
+	mem_cgroup_charge_statistics(memcg, nr_pages);
+	memcg_check_events(memcg, folio_nid(new));
 	local_irq_restore(flags);
 }
 
@@ -7204,8 +7163,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
 	VM_BUG_ON(!irqs_disabled());
-	mem_cgroup_charge_statistics(memcg, page, -nr_entries);
-	memcg_check_events(memcg, page);
+	mem_cgroup_charge_statistics(memcg, -nr_entries);
+	memcg_check_events(memcg, page_to_nid(page));
 	css_put(&memcg->css);
 }
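The reader-side half of the stats flushing scheme added by this diff (memcg_rstat_updated() feeding stats_flush_threshold, and mem_cgroup_flush_stats() draining it only past a threshold) can be modelled outside the kernel. The sketch below is an illustration only, not kernel code: the names model_rstat_updated()/model_flush_stats(), the batch size of 32 and the CPU count of 4 are assumptions standing in for MEMCG_CHARGE_BATCH and num_online_cpus(), and the 2-second periodic flush worker is not modelled.

/*
 * Standalone C11 model of the threshold-based flush in this patch.
 * Writers bump a thread-local counter and publish one shared "batch"
 * per CHARGE_BATCH updates; readers flush only once more than NR_CPUS
 * batches are pending. Values and names here are assumptions.
 */
#include <stdatomic.h>
#include <stdio.h>

#define CHARGE_BATCH 32   /* stand-in for MEMCG_CHARGE_BATCH */
#define NR_CPUS      4    /* assumed number of online CPUs */

static _Thread_local unsigned int stats_updates;   /* models the per-CPU counter */
static atomic_int stats_flush_threshold;

/* Called on every stat update: publish one batch per CHARGE_BATCH updates. */
static void model_rstat_updated(void)
{
	if (!(++stats_updates % CHARGE_BATCH))
		atomic_fetch_add(&stats_flush_threshold, 1);
}

/* Reader side: flush only once enough batches have accumulated system-wide. */
static void model_flush_stats(void)
{
	if (atomic_load(&stats_flush_threshold) > NR_CPUS) {
		printf("flush (%d batches pending)\n",
		       atomic_load(&stats_flush_threshold));
		atomic_store(&stats_flush_threshold, 0);
	}
}

int main(void)
{
	for (int i = 1; i <= 2000; i++) {
		model_rstat_updated();
		if (i % 500 == 0)	/* a "reader" polling occasionally */
			model_flush_stats();
	}
	return 0;
}

The design point visible in the model: writers touch only a per-CPU counter in the common case and a shared atomic once per batch, while readers pay for a flush only after roughly CHARGE_BATCH * nr_cpus updates have accumulated; the periodic 2-second worker in the real patch bounds how stale the stats can get in between.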