Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 798 | 
1 file changed, 503 insertions(+), 295 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13f559af1ab6..b807952b4d43 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);  struct mem_cgroup *root_mem_cgroup __read_mostly; -#define MEM_CGROUP_RECLAIM_RETRIES	5 -  /* Socket memory accounting disabled? */  static bool cgroup_memory_nosocket; @@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)  }  #ifdef CONFIG_MEMCG_KMEM +extern spinlock_t css_set_lock; + +static void obj_cgroup_release(struct percpu_ref *ref) +{ +	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); +	struct mem_cgroup *memcg; +	unsigned int nr_bytes; +	unsigned int nr_pages; +	unsigned long flags; + +	/* +	 * At this point all allocated objects are freed, and +	 * objcg->nr_charged_bytes can't have an arbitrary byte value. +	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE). +	 * +	 * The following sequence can lead to it: +	 * 1) CPU0: objcg == stock->cached_objcg +	 * 2) CPU1: we do a small allocation (e.g. 92 bytes), +	 *          PAGE_SIZE bytes are charged +	 * 3) CPU1: a process from another memcg is allocating something, +	 *          the stock if flushed, +	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92 +	 * 5) CPU0: we do release this object, +	 *          92 bytes are added to stock->nr_bytes +	 * 6) CPU0: stock is flushed, +	 *          92 bytes are added to objcg->nr_charged_bytes +	 * +	 * In the result, nr_charged_bytes == PAGE_SIZE. +	 * This page will be uncharged in obj_cgroup_release(). +	 */ +	nr_bytes = atomic_read(&objcg->nr_charged_bytes); +	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); +	nr_pages = nr_bytes >> PAGE_SHIFT; + +	spin_lock_irqsave(&css_set_lock, flags); +	memcg = obj_cgroup_memcg(objcg); +	if (nr_pages) +		__memcg_kmem_uncharge(memcg, nr_pages); +	list_del(&objcg->list); +	mem_cgroup_put(memcg); +	spin_unlock_irqrestore(&css_set_lock, flags); + +	percpu_ref_exit(ref); +	kfree_rcu(objcg, rcu); +} + +static struct obj_cgroup *obj_cgroup_alloc(void) +{ +	struct obj_cgroup *objcg; +	int ret; + +	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); +	if (!objcg) +		return NULL; + +	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, +			      GFP_KERNEL); +	if (ret) { +		kfree(objcg); +		return NULL; +	} +	INIT_LIST_HEAD(&objcg->list); +	return objcg; +} + +static void memcg_reparent_objcgs(struct mem_cgroup *memcg, +				  struct mem_cgroup *parent) +{ +	struct obj_cgroup *objcg, *iter; + +	objcg = rcu_replace_pointer(memcg->objcg, NULL, true); + +	spin_lock_irq(&css_set_lock); + +	/* Move active objcg to the parent's list */ +	xchg(&objcg->memcg, parent); +	css_get(&parent->css); +	list_add(&objcg->list, &parent->objcg_list); + +	/* Move already reparented objcgs to the parent's list */ +	list_for_each_entry(iter, &memcg->objcg_list, list) { +		css_get(&parent->css); +		xchg(&iter->memcg, parent); +		css_put(&memcg->css); +	} +	list_splice(&memcg->objcg_list, &parent->objcg_list); + +	spin_unlock_irq(&css_set_lock); + +	percpu_ref_kill(&objcg->refcnt); +} +  /* - * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. + * This will be used as a shrinker list's index.   * The main reason for not using cgroup id for this:   *  this works better in sparse environments, where we have a lot of memcgs,   *  but only a few kmem-limited. 
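The WARN_ON_ONCE() in obj_cgroup_release() above relies on a non-obvious invariant: individual object charges are made in bytes, yet by the time the last reference is dropped the leftover in objcg->nr_charged_bytes can only be zero or a whole number of pages. A minimal userspace sketch that replays the CPU0/CPU1 scenario from the comment (all names below are illustrative stand-ins, not kernel API):

/*
 * Model of the byte bookkeeping behind obj_cgroup_release():
 * a sub-page charge always rounds up to a full page and parks the
 * unused remainder, so the bytes freed later recombine into whole
 * pages.  Userspace illustration only.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int nr_charged_bytes;  /* models objcg->nr_charged_bytes */
static unsigned int stock_bytes;       /* models one cpu's stock->nr_bytes */

static void charge_object(unsigned int size)
{
    /* a sub-page allocation charges a whole page to the page counter
     * and keeps the unused remainder as stocked bytes */
    stock_bytes += PAGE_SIZE - size;
}

static void release_object(unsigned int size)
{
    /* freeing the object returns its bytes to the stock */
    stock_bytes += size;
}

static void flush_stock(void)
{
    /* the per-cpu stock is flushed to the central per-objcg counter */
    nr_charged_bytes += stock_bytes;
    stock_bytes = 0;
}

int main(void)
{
    charge_object(92);   /* CPU1: 92-byte allocation, one page charged */
    flush_stock();       /* CPU1: another memcg allocates, stock flushed */
    release_object(92);  /* CPU0: the object is freed */
    flush_stock();       /* CPU0: that stock is flushed too */

    /* the leftover is a whole page, as the WARN_ON_ONCE() expects */
    assert(nr_charged_bytes % PAGE_SIZE == 0);
    printf("nr_charged_bytes = %u bytes = %u page(s)\n",
           nr_charged_bytes, nr_charged_bytes / PAGE_SIZE);
    return 0;
}

Because every sub-page charge rounds up to a full page and stores the remainder, the bytes freed later always recombine into whole pages, which is exactly what the release path uncharges.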
Or also, if we have, for instance, 200 @@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)  /*   * A lot of the calls to the cache allocation functions are expected to be - * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are   * conditional to this static branch, we'll have to allow modules that does   * kmem_cache_alloc and the such to see this symbol as well   */  DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);  EXPORT_SYMBOL(memcg_kmem_enabled_key); - -struct workqueue_struct *memcg_kmem_cache_wq;  #endif  static int memcg_shrinker_map_size; @@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)  	unsigned long ino = 0;  	rcu_read_lock(); -	if (PageSlab(page) && !PageTail(page)) -		memcg = memcg_from_slab_page(page); -	else -		memcg = READ_ONCE(page->mem_cgroup); +	memcg = page->mem_cgroup; + +	/* +	 * The lowest bit set means that memcg isn't a valid +	 * memcg pointer, but a obj_cgroups pointer. +	 * In this case the page is shared and doesn't belong +	 * to any specific memory cgroup. +	 */ +	if ((unsigned long) memcg & 0x1UL) +		memcg = NULL; +  	while (memcg && !(memcg->css.flags & CSS_ONLINE))  		memcg = parent_mem_cgroup(memcg);  	if (memcg) @@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)   */  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)  { -	long x; +	long x, threshold = MEMCG_CHARGE_BATCH;  	if (mem_cgroup_disabled())  		return; +	if (memcg_stat_item_in_bytes(idx)) +		threshold <<= PAGE_SHIFT; +  	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); -	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { +	if (unlikely(abs(x) > threshold)) {  		struct mem_cgroup *mi;  		/* @@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)  	return mem_cgroup_nodeinfo(parent, nid);  } -/** - * __mod_lruvec_state - update lruvec memory statistics - * @lruvec: the lruvec - * @idx: the stat item - * @val: delta to add to the counter, can be negative - * - * The lruvec is the intersection of the NUMA node and a cgroup. This - * function updates the all three counters that are affected by a - * change of state at this level: per-node, per-cgroup, per-lruvec. 
- */ -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, -			int val) +void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +			      int val)  { -	pg_data_t *pgdat = lruvec_pgdat(lruvec);  	struct mem_cgroup_per_node *pn;  	struct mem_cgroup *memcg; -	long x; - -	/* Update node */ -	__mod_node_page_state(pgdat, idx, val); - -	if (mem_cgroup_disabled()) -		return; +	long x, threshold = MEMCG_CHARGE_BATCH;  	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);  	memcg = pn->memcg; @@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,  	/* Update lruvec */  	__this_cpu_add(pn->lruvec_stat_local->count[idx], val); +	if (vmstat_item_in_bytes(idx)) +		threshold <<= PAGE_SHIFT; +  	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); -	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { +	if (unlikely(abs(x) > threshold)) { +		pg_data_t *pgdat = lruvec_pgdat(lruvec);  		struct mem_cgroup_per_node *pi;  		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) @@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,  	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);  } +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. This + * function updates the all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec. + */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +			int val) +{ +	/* Update node */ +	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + +	/* Update memcg and lruvec */ +	if (!mem_cgroup_disabled()) +		__mod_memcg_lruvec_state(lruvec, idx, val); +} +  void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)  {  	pg_data_t *pgdat = page_pgdat(virt_to_page(p)); @@ -1004,7 +1110,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  				   struct mem_cgroup *prev,  				   struct mem_cgroup_reclaim_cookie *reclaim)  { -	struct mem_cgroup_reclaim_iter *uninitialized_var(iter); +	struct mem_cgroup_reclaim_iter *iter;  	struct cgroup_subsys_state *css = NULL;  	struct mem_cgroup *memcg = NULL;  	struct mem_cgroup *pos = NULL; @@ -1377,12 +1483,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg)  		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *  		       PAGE_SIZE);  	seq_buf_printf(&s, "kernel_stack %llu\n", -		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * +		       (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *  		       1024);  	seq_buf_printf(&s, "slab %llu\n", -		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + -			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * -		       PAGE_SIZE); +		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + +			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); +	seq_buf_printf(&s, "percpu %llu\n", +		       (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));  	seq_buf_printf(&s, "sock %llu\n",  		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *  		       PAGE_SIZE); @@ -1412,11 +1519,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)  			       PAGE_SIZE);  	seq_buf_printf(&s, "slab_reclaimable %llu\n", -		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * -		       PAGE_SIZE); +		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));  	
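For context on the threshold change in __mod_memcg_state() and __mod_memcg_lruvec_state() above: slab and percpu statistics are now kept in bytes (the *_B items), so flushing the per-cpu delta once it exceeds MEMCG_CHARGE_BATCH, a page count, would flush on nearly every update. Scaling the threshold by PAGE_SHIFT for byte-counted items keeps the flush cadence comparable. A small sketch of the pattern, with invented names standing in for the kernel ones:

/*
 * Batched per-cpu counter updates: deltas are buffered per cpu and only
 * folded into the shared counter once they exceed a threshold, which is
 * scaled into bytes for byte-counted items.  Userspace model only.
 */
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH 32u   /* stand-in for MEMCG_CHARGE_BATCH */
#define PAGE_SHIFT   12

static long percpu_pending;  /* models the per-cpu stat slot */
static long shared_counter;  /* models the hierarchical counter */

static void mod_state(long val, int item_in_bytes)
{
    long threshold = CHARGE_BATCH;
    long x;

    if (item_in_bytes)
        threshold <<= PAGE_SHIFT;  /* same number of pages, in bytes */

    x = percpu_pending + val;
    if (labs(x) > threshold) {
        shared_counter += x;       /* flush to the shared counter */
        x = 0;
    }
    percpu_pending = x;
}

int main(void)
{
    int i;

    /* 100 one-page updates: flushed roughly every CHARGE_BATCH pages */
    for (i = 0; i < 100; i++)
        mod_state(1, 0);
    printf("pages: shared=%ld pending=%ld\n", shared_counter, percpu_pending);

    shared_counter = percpu_pending = 0;

    /* the same updates expressed in bytes: same flush cadence */
    for (i = 0; i < 100; i++)
        mod_state(1 << PAGE_SHIFT, 1);
    printf("bytes: shared=%ld pending=%ld\n", shared_counter, percpu_pending);
    return 0;
}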
seq_buf_printf(&s, "slab_unreclaimable %llu\n", -		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * -		       PAGE_SIZE); +		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));  	/* Accumulated memory events */ @@ -1425,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)  	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),  		       memcg_events(memcg, PGMAJFAULT)); -	seq_buf_printf(&s, "workingset_refault %lu\n", -		       memcg_page_state(memcg, WORKINGSET_REFAULT)); -	seq_buf_printf(&s, "workingset_activate %lu\n", -		       memcg_page_state(memcg, WORKINGSET_ACTIVATE)); +	seq_buf_printf(&s, "workingset_refault_anon %lu\n", +		       memcg_page_state(memcg, WORKINGSET_REFAULT_ANON)); +	seq_buf_printf(&s, "workingset_refault_file %lu\n", +		       memcg_page_state(memcg, WORKINGSET_REFAULT_FILE)); +	seq_buf_printf(&s, "workingset_activate_anon %lu\n", +		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON)); +	seq_buf_printf(&s, "workingset_activate_file %lu\n", +		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));  	seq_buf_printf(&s, "workingset_restore %lu\n", -		       memcg_page_state(memcg, WORKINGSET_RESTORE)); +		       memcg_page_state(memcg, WORKINGSET_RESTORE_ANON)); +	seq_buf_printf(&s, "workingset_restore %lu\n", +		       memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));  	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",  		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -1560,15 +1671,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,  		.gfp_mask = gfp_mask,  		.order = order,  	}; -	bool ret; +	bool ret = true;  	if (mutex_lock_killable(&oom_lock))  		return true; + +	if (mem_cgroup_margin(memcg) >= (1 << order)) +		goto unlock; +  	/*  	 * A few threads which were not waiting at mutex_lock_killable() can  	 * fail to bail out. Therefore, check again after holding oom_lock.  	 */  	ret = should_force_charge() || out_of_memory(&oc); + +unlock:  	mutex_unlock(&oom_lock);  	return ret;  } @@ -2039,6 +2156,12 @@ EXPORT_SYMBOL(unlock_page_memcg);  struct memcg_stock_pcp {  	struct mem_cgroup *cached; /* this never be root cgroup */  	unsigned int nr_pages; + +#ifdef CONFIG_MEMCG_KMEM +	struct obj_cgroup *cached_objcg; +	unsigned int nr_bytes; +#endif +  	struct work_struct work;  	unsigned long flags;  #define FLUSHING_CACHED_CHARGE	0 @@ -2046,6 +2169,22 @@ struct memcg_stock_pcp {  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);  static DEFINE_MUTEX(percpu_charge_mutex); +#ifdef CONFIG_MEMCG_KMEM +static void drain_obj_stock(struct memcg_stock_pcp *stock); +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +				     struct mem_cgroup *root_memcg); + +#else +static inline void drain_obj_stock(struct memcg_stock_pcp *stock) +{ +} +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +				     struct mem_cgroup *root_memcg) +{ +	return false; +} +#endif +  /**   * consume_stock: Try to consume stocked charge on this cpu.   * @memcg: memcg to consume from. 
@@ -2086,13 +2225,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)  {  	struct mem_cgroup *old = stock->cached; +	if (!old) +		return; +  	if (stock->nr_pages) {  		page_counter_uncharge(&old->memory, stock->nr_pages);  		if (do_memsw_account())  			page_counter_uncharge(&old->memsw, stock->nr_pages); -		css_put_many(&old->css, stock->nr_pages);  		stock->nr_pages = 0;  	} + +	css_put(&old->css);  	stock->cached = NULL;  } @@ -2108,6 +2251,7 @@ static void drain_local_stock(struct work_struct *dummy)  	local_irq_save(flags);  	stock = this_cpu_ptr(&memcg_stock); +	drain_obj_stock(stock);  	drain_stock(stock);  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); @@ -2128,6 +2272,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)  	stock = this_cpu_ptr(&memcg_stock);  	if (stock->cached != memcg) { /* reset if necessary */  		drain_stock(stock); +		css_get(&memcg->css);  		stock->cached = memcg;  	}  	stock->nr_pages += nr_pages; @@ -2166,6 +2311,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)  		if (memcg && stock->nr_pages &&  		    mem_cgroup_is_descendant(memcg, root_memcg))  			flush = true; +		if (obj_stock_flush_required(stock, root_memcg)) +			flush = true;  		rcu_read_unlock();  		if (flush && @@ -2228,18 +2375,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)  	return 0;  } -static void reclaim_high(struct mem_cgroup *memcg, -			 unsigned int nr_pages, -			 gfp_t gfp_mask) +static unsigned long reclaim_high(struct mem_cgroup *memcg, +				  unsigned int nr_pages, +				  gfp_t gfp_mask)  { +	unsigned long nr_reclaimed = 0; +  	do { +		unsigned long pflags; +  		if (page_counter_read(&memcg->memory) <=  		    READ_ONCE(memcg->memory.high))  			continue; +  		memcg_memory_event(memcg, MEMCG_HIGH); -		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + +		psi_memstall_enter(&pflags); +		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, +							     gfp_mask, true); +		psi_memstall_leave(&pflags);  	} while ((memcg = parent_mem_cgroup(memcg)) &&  		 !mem_cgroup_is_root(memcg)); + +	return nr_reclaimed;  }  static void high_work_func(struct work_struct *work) @@ -2264,7 +2422,7 @@ static void high_work_func(struct work_struct *work)   *   * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the   *   overage ratio to a delay. - * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the   *   proposed penalty in order to reduce to a reasonable number of jiffies, and   *   to produce a reasonable delay curve.   * @@ -2395,16 +2553,32 @@ void mem_cgroup_handle_over_high(void)  {  	unsigned long penalty_jiffies;  	unsigned long pflags; +	unsigned long nr_reclaimed;  	unsigned int nr_pages = current->memcg_nr_pages_over_high; +	int nr_retries = MAX_RECLAIM_RETRIES;  	struct mem_cgroup *memcg; +	bool in_retry = false;  	if (likely(!nr_pages))  		return;  	memcg = get_mem_cgroup_from_mm(current->mm); -	reclaim_high(memcg, nr_pages, GFP_KERNEL);  	current->memcg_nr_pages_over_high = 0; +retry_reclaim: +	/* +	 * The allocating task should reclaim at least the batch size, but for +	 * subsequent retries we only want to do what's necessary to prevent oom +	 * or breaching resource isolation. +	 * +	 * This is distinct from memory.max or page allocator behaviour because +	 * memory.high is currently batched, whereas memory.max and the page +	 * allocator run every time an allocation is made. 
+	 */ +	nr_reclaimed = reclaim_high(memcg, +				    in_retry ? SWAP_CLUSTER_MAX : nr_pages, +				    GFP_KERNEL); +  	/*  	 * memory.high is breached and reclaim is unable to keep up. Throttle  	 * allocators proactively to slow down excessive growth. @@ -2432,6 +2606,16 @@ void mem_cgroup_handle_over_high(void)  		goto out;  	/* +	 * If reclaim is making forward progress but we're still over +	 * memory.high, we want to encourage that rather than doing allocator +	 * throttling. +	 */ +	if (nr_reclaimed || nr_retries--) { +		in_retry = true; +		goto retry_reclaim; +	} + +	/*  	 * If we exit early, we're guaranteed to die (since  	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't  	 * need to account for any ill-begotten jiffies to pay them off later. @@ -2448,13 +2632,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  		      unsigned int nr_pages)  {  	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); -	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; +	int nr_retries = MAX_RECLAIM_RETRIES;  	struct mem_cgroup *mem_over_limit;  	struct page_counter *counter; +	enum oom_status oom_status;  	unsigned long nr_reclaimed;  	bool may_swap = true;  	bool drained = false; -	enum oom_status oom_status; +	unsigned long pflags;  	if (mem_cgroup_is_root(memcg))  		return 0; @@ -2514,8 +2699,10 @@ retry:  	memcg_memory_event(mem_over_limit, MEMCG_MAX); +	psi_memstall_enter(&pflags);  	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,  						    gfp_mask, may_swap); +	psi_memstall_leave(&pflags);  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)  		goto retry; @@ -2567,7 +2754,7 @@ retry:  		       get_order(nr_pages * PAGE_SIZE));  	switch (oom_status) {  	case OOM_SUCCESS: -		nr_retries = MEM_CGROUP_RECLAIM_RETRIES; +		nr_retries = MAX_RECLAIM_RETRIES;  		goto retry;  	case OOM_FAILED:  		goto force; @@ -2586,12 +2773,10 @@ force:  	page_counter_charge(&memcg->memory, nr_pages);  	if (do_memsw_account())  		page_counter_charge(&memcg->memsw, nr_pages); -	css_get_many(&memcg->css, nr_pages);  	return 0;  done_restock: -	css_get_many(&memcg->css, batch);  	if (batch > nr_pages)  		refill_stock(memcg, batch - nr_pages); @@ -2649,8 +2834,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)  	page_counter_uncharge(&memcg->memory, nr_pages);  	if (do_memsw_account())  		page_counter_uncharge(&memcg->memsw, nr_pages); - -	css_put_many(&memcg->css, nr_pages);  }  #endif @@ -2669,6 +2852,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)  }  #ifdef CONFIG_MEMCG_KMEM +int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, +				 gfp_t gfp) +{ +	unsigned int objects = objs_per_slab_page(s, page); +	void *vec; + +	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, +			   page_to_nid(page)); +	if (!vec) +		return -ENOMEM; + +	if (cmpxchg(&page->obj_cgroups, NULL, +		    (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) +		kfree(vec); +	else +		kmemleak_not_leak(vec); + +	return 0; +} +  /*   * Returns a pointer to the memory cgroup to which the kernel object is charged.   * @@ -2685,17 +2888,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)  	page = virt_to_head_page(p);  	/* -	 * Slab pages don't have page->mem_cgroup set because corresponding -	 * kmem caches can be reparented during the lifetime. That's why -	 * memcg_from_slab_page() should be used instead. +	 * Slab objects are accounted individually, not per-page. 
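The mem_cgroup_handle_over_high() rework earlier in this hunk turns a single reclaim attempt into a loop: reclaim the full batch once, then retry in SWAP_CLUSTER_MAX chunks while reclaim is still making progress (or retries remain), and only sleep off the penalty when it is not. A control-flow sketch with stubbed-out reclaim and penalty helpers (everything below is invented for illustration; the real function derives the penalty from the overage):

/* Sketch of the retry-then-throttle decision in the over-high path. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32u   /* same value as the kernel constant */

/* pretend reclaim: frees up to @nr_pages, then runs out of easy pages */
static unsigned long reclaim_high(unsigned int nr_pages)
{
    static unsigned long reclaimable = 40;
    unsigned long freed = reclaimable < nr_pages ? reclaimable : nr_pages;

    reclaimable -= freed;
    return freed;
}

/* pretend penalty: always well over the "ignore it" threshold */
static unsigned long penalty_jiffies(void) { return 100; }

int main(void)
{
    unsigned int nr_pages = 64;   /* pages charged over memory.high */
    int nr_retries = 5;           /* stand-in for MAX_RECLAIM_RETRIES */
    int in_retry = 0;
    unsigned long nr_reclaimed;

retry_reclaim:
    nr_reclaimed = reclaim_high(in_retry ? SWAP_CLUSTER_MAX : nr_pages);
    printf("reclaimed %lu pages\n", nr_reclaimed);

    if (penalty_jiffies() <= 2)   /* small overage: nothing more to do */
        return 0;

    /* keep reclaiming while it works, rather than throttling */
    if (nr_reclaimed || nr_retries--) {
        in_retry = 1;
        goto retry_reclaim;
    }

    printf("no progress after retries: throttle for %lu jiffies\n",
           penalty_jiffies());
    return 0;
}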
+	 * Memcg membership data for each individual object is saved in +	 * the page->obj_cgroups.  	 */ -	if (PageSlab(page)) -		return memcg_from_slab_page(page); +	if (page_has_obj_cgroups(page)) { +		struct obj_cgroup *objcg; +		unsigned int off; + +		off = obj_to_index(page->slab_cache, page, p); +		objcg = page_obj_cgroups(page)[off]; +		if (objcg) +			return obj_cgroup_memcg(objcg); + +		return NULL; +	}  	/* All other pages use page->mem_cgroup */  	return page->mem_cgroup;  } +__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) +{ +	struct obj_cgroup *objcg = NULL; +	struct mem_cgroup *memcg; + +	if (unlikely(!current->mm && !current->active_memcg)) +		return NULL; + +	rcu_read_lock(); +	if (unlikely(current->active_memcg)) +		memcg = rcu_dereference(current->active_memcg); +	else +		memcg = mem_cgroup_from_task(current); + +	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { +		objcg = rcu_dereference(memcg->objcg); +		if (objcg && obj_cgroup_tryget(objcg)) +			break; +	} +	rcu_read_unlock(); + +	return objcg; +} +  static int memcg_alloc_cache_id(void)  {  	int id, size; @@ -2721,9 +2957,7 @@ static int memcg_alloc_cache_id(void)  	else if (size > MEMCG_CACHES_MAX_SIZE)  		size = MEMCG_CACHES_MAX_SIZE; -	err = memcg_update_all_caches(size); -	if (!err) -		err = memcg_update_all_list_lrus(size); +	err = memcg_update_all_list_lrus(size);  	if (!err)  		memcg_nr_cache_ids = size; @@ -2741,150 +2975,6 @@ static void memcg_free_cache_id(int id)  	ida_simple_remove(&memcg_cache_ida, id);  } -struct memcg_kmem_cache_create_work { -	struct mem_cgroup *memcg; -	struct kmem_cache *cachep; -	struct work_struct work; -}; - -static void memcg_kmem_cache_create_func(struct work_struct *w) -{ -	struct memcg_kmem_cache_create_work *cw = -		container_of(w, struct memcg_kmem_cache_create_work, work); -	struct mem_cgroup *memcg = cw->memcg; -	struct kmem_cache *cachep = cw->cachep; - -	memcg_create_kmem_cache(memcg, cachep); - -	css_put(&memcg->css); -	kfree(cw); -} - -/* - * Enqueue the creation of a per-memcg kmem_cache. - */ -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, -					       struct kmem_cache *cachep) -{ -	struct memcg_kmem_cache_create_work *cw; - -	if (!css_tryget_online(&memcg->css)) -		return; - -	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); -	if (!cw) { -		css_put(&memcg->css); -		return; -	} - -	cw->memcg = memcg; -	cw->cachep = cachep; -	INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - -	queue_work(memcg_kmem_cache_wq, &cw->work); -} - -static inline bool memcg_kmem_bypass(void) -{ -	if (in_interrupt()) -		return true; - -	/* Allow remote memcg charging in kthread contexts. */ -	if ((!current->mm || (current->flags & PF_KTHREAD)) && -	     !current->active_memcg) -		return true; -	return false; -} - -/** - * memcg_kmem_get_cache: select the correct per-memcg cache for allocation - * @cachep: the original global kmem cache - * - * Return the kmem_cache we're supposed to use for a slab allocation. - * We try to use the current memcg's version of the cache. - * - * If the cache does not exist yet, if we are the first user of it, we - * create it asynchronously in a workqueue and let the current allocation - * go through with the original cache. - * - * This function takes a reference to the cache it returns to assure it - * won't get destroyed while we are working with it. Once the caller is - * done with it, memcg_kmem_put_cache() must be called to release the - * reference. 
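The "| 0x1UL" in memcg_alloc_page_obj_cgroups() and the "& 0x1UL" test in page_cgroup_ino() and mem_cgroup_from_obj() are two halves of one trick: page->mem_cgroup and page->obj_cgroups share the same word in struct page, and since both point to word-aligned memory, bit 0 is free to record which of the two the word currently holds. A standalone sketch of the tagging (the types and helpers below are invented for illustration):

/* Low-bit pointer tagging: one word, two possible pointer meanings. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mem_cgroup { const char *name; };
struct obj_cgroup { struct mem_cgroup *memcg; };

/* one tagged word, standing in for the union in struct page */
static uintptr_t page_word;

static void set_memcg(struct mem_cgroup *memcg)
{
    page_word = (uintptr_t)memcg;            /* bit 0 clear: plain memcg */
}

static void set_obj_cgroups(struct obj_cgroup **vec)
{
    page_word = (uintptr_t)vec | (uintptr_t)0x1;  /* bit 0 set: objcg vector */
}

static int page_has_obj_cgroups(void)
{
    return page_word & 0x1;
}

static struct obj_cgroup **page_obj_cgroups(void)
{
    return (struct obj_cgroup **)(page_word & ~(uintptr_t)0x1);
}

int main(void)
{
    struct mem_cgroup memcg = { "A" };
    struct obj_cgroup objcg = { &memcg };
    struct obj_cgroup **vec;

    set_memcg(&memcg);
    assert(!page_has_obj_cgroups());

    /* a slab page instead carries a per-object vector of objcg pointers */
    vec = calloc(4, sizeof(*vec));
    vec[2] = &objcg;
    set_obj_cgroups(vec);

    assert(page_has_obj_cgroups());
    printf("object 2 belongs to memcg %s\n",
           page_obj_cgroups()[2]->memcg->name);
    free(vec);
    return 0;
}

obj_to_index() then turns an object address into a slot in that vector, which is how a single slab page can hold objects charged to different cgroups.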
- */ -struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) -{ -	struct mem_cgroup *memcg; -	struct kmem_cache *memcg_cachep; -	struct memcg_cache_array *arr; -	int kmemcg_id; - -	VM_BUG_ON(!is_root_cache(cachep)); - -	if (memcg_kmem_bypass()) -		return cachep; - -	rcu_read_lock(); - -	if (unlikely(current->active_memcg)) -		memcg = current->active_memcg; -	else -		memcg = mem_cgroup_from_task(current); - -	if (!memcg || memcg == root_mem_cgroup) -		goto out_unlock; - -	kmemcg_id = READ_ONCE(memcg->kmemcg_id); -	if (kmemcg_id < 0) -		goto out_unlock; - -	arr = rcu_dereference(cachep->memcg_params.memcg_caches); - -	/* -	 * Make sure we will access the up-to-date value. The code updating -	 * memcg_caches issues a write barrier to match the data dependency -	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()). -	 */ -	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]); - -	/* -	 * If we are in a safe context (can wait, and not in interrupt -	 * context), we could be be predictable and return right away. -	 * This would guarantee that the allocation being performed -	 * already belongs in the new cache. -	 * -	 * However, there are some clashes that can arrive from locking. -	 * For instance, because we acquire the slab_mutex while doing -	 * memcg_create_kmem_cache, this means no further allocation -	 * could happen with the slab_mutex held. So it's better to -	 * defer everything. -	 * -	 * If the memcg is dying or memcg_cache is about to be released, -	 * don't bother creating new kmem_caches. Because memcg_cachep -	 * is ZEROed as the fist step of kmem offlining, we don't need -	 * percpu_ref_tryget_live() here. css_tryget_online() check in -	 * memcg_schedule_kmem_cache_create() will prevent us from -	 * creation of a new kmem_cache. 
-	 */ -	if (unlikely(!memcg_cachep)) -		memcg_schedule_kmem_cache_create(memcg, cachep); -	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) -		cachep = memcg_cachep; -out_unlock: -	rcu_read_unlock(); -	return cachep; -} - -/** - * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache - * @cachep: the cache returned by memcg_kmem_get_cache - */ -void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ -	if (!is_root_cache(cachep)) -		percpu_ref_put(&cachep->memcg_params.refcnt); -} -  /**   * __memcg_kmem_charge: charge a number of kernel pages to a memcg   * @memcg: memory cgroup to charge @@ -2958,6 +3048,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)  		if (!ret) {  			page->mem_cgroup = memcg;  			__SetPageKmemcg(page); +			return 0;  		}  	}  	css_put(&memcg->css); @@ -2980,13 +3071,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);  	__memcg_kmem_uncharge(memcg, nr_pages);  	page->mem_cgroup = NULL; +	css_put(&memcg->css);  	/* slab pages do not have PageKmemcg flag set */  	if (PageKmemcg(page))  		__ClearPageKmemcg(page); +} + +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +{ +	struct memcg_stock_pcp *stock; +	unsigned long flags; +	bool ret = false; + +	local_irq_save(flags); -	css_put_many(&memcg->css, nr_pages); +	stock = this_cpu_ptr(&memcg_stock); +	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { +		stock->nr_bytes -= nr_bytes; +		ret = true; +	} + +	local_irq_restore(flags); + +	return ret; +} + +static void drain_obj_stock(struct memcg_stock_pcp *stock) +{ +	struct obj_cgroup *old = stock->cached_objcg; + +	if (!old) +		return; + +	if (stock->nr_bytes) { +		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; +		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + +		if (nr_pages) { +			rcu_read_lock(); +			__memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); +			rcu_read_unlock(); +		} + +		/* +		 * The leftover is flushed to the centralized per-memcg value. +		 * On the next attempt to refill obj stock it will be moved +		 * to a per-cpu stock (probably, on an other CPU), see +		 * refill_obj_stock(). +		 * +		 * How often it's flushed is a trade-off between the memory +		 * limit enforcement accuracy and potential CPU contention, +		 * so it might be changed in the future. 
+		 */ +		atomic_add(nr_bytes, &old->nr_charged_bytes); +		stock->nr_bytes = 0; +	} + +	obj_cgroup_put(old); +	stock->cached_objcg = NULL;  } + +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +				     struct mem_cgroup *root_memcg) +{ +	struct mem_cgroup *memcg; + +	if (stock->cached_objcg) { +		memcg = obj_cgroup_memcg(stock->cached_objcg); +		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) +			return true; +	} + +	return false; +} + +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +{ +	struct memcg_stock_pcp *stock; +	unsigned long flags; + +	local_irq_save(flags); + +	stock = this_cpu_ptr(&memcg_stock); +	if (stock->cached_objcg != objcg) { /* reset if necessary */ +		drain_obj_stock(stock); +		obj_cgroup_get(objcg); +		stock->cached_objcg = objcg; +		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); +	} +	stock->nr_bytes += nr_bytes; + +	if (stock->nr_bytes > PAGE_SIZE) +		drain_obj_stock(stock); + +	local_irq_restore(flags); +} + +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ +	struct mem_cgroup *memcg; +	unsigned int nr_pages, nr_bytes; +	int ret; + +	if (consume_obj_stock(objcg, size)) +		return 0; + +	/* +	 * In theory, memcg->nr_charged_bytes can have enough +	 * pre-charged bytes to satisfy the allocation. However, +	 * flushing memcg->nr_charged_bytes requires two atomic +	 * operations, and memcg->nr_charged_bytes can't be big, +	 * so it's better to ignore it and try grab some new pages. +	 * memcg->nr_charged_bytes will be flushed in +	 * refill_obj_stock(), called from this function or +	 * independently later. +	 */ +	rcu_read_lock(); +	memcg = obj_cgroup_memcg(objcg); +	css_get(&memcg->css); +	rcu_read_unlock(); + +	nr_pages = size >> PAGE_SHIFT; +	nr_bytes = size & (PAGE_SIZE - 1); + +	if (nr_bytes) +		nr_pages += 1; + +	ret = __memcg_kmem_charge(memcg, gfp, nr_pages); +	if (!ret && nr_bytes) +		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); + +	css_put(&memcg->css); +	return ret; +} + +void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) +{ +	refill_obj_stock(objcg, size); +} +  #endif /* CONFIG_MEMCG_KMEM */  #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2997,13 +3221,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)   */  void mem_cgroup_split_huge_fixup(struct page *head)  { +	struct mem_cgroup *memcg = head->mem_cgroup;  	int i;  	if (mem_cgroup_disabled())  		return; -	for (i = 1; i < HPAGE_PMD_NR; i++) -		head[i].mem_cgroup = head->mem_cgroup; +	for (i = 1; i < HPAGE_PMD_NR; i++) { +		css_get(&memcg->css); +		head[i].mem_cgroup = memcg; +	}  }  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3207,7 +3434,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)   */  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)  { -	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; +	int nr_retries = MAX_RECLAIM_RETRIES;  	/* we call try-to-free pages for make this cgroup empty */  	lru_add_drain_all(); @@ -3404,6 +3631,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)  #ifdef CONFIG_MEMCG_KMEM  static int memcg_online_kmem(struct mem_cgroup *memcg)  { +	struct obj_cgroup *objcg;  	int memcg_id;  	if (cgroup_memory_nokmem) @@ -3416,7 +3644,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)  	if (memcg_id < 0)  		return memcg_id; -	static_branch_inc(&memcg_kmem_enabled_key); +	objcg = obj_cgroup_alloc(); +	if (!objcg) { +		memcg_free_cache_id(memcg_id); +		return -ENOMEM; +	} +	objcg->memcg = memcg; +	
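obj_cgroup_charge(), obj_cgroup_uncharge() and the refill/drain helpers above implement byte-granular accounting on top of the page-granular page_counter: the counter only ever sees whole pages, and the sub-page remainder is parked in a per-cpu stock so that consecutive small objects share one page charge. A simplified single-cpu model of that flow (invented names; irq disabling, the kmem counter and multi-cpu hand-off details are omitted):

/* Byte-level charging on top of a page-granular counter, one cpu. */
#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned long charged_pages;   /* models the memcg page counter */
static unsigned int stock_bytes;      /* models stock->nr_bytes (one cpu) */
static unsigned int central_bytes;    /* models objcg->nr_charged_bytes */

static void drain_obj_stock(void)
{
    charged_pages -= stock_bytes / PAGE_SIZE;  /* uncharge whole pages */
    central_bytes += stock_bytes % PAGE_SIZE;  /* flush the remainder */
    stock_bytes = 0;
}

static void refill_obj_stock(unsigned int nr_bytes)
{
    stock_bytes += nr_bytes;
    if (stock_bytes > PAGE_SIZE)
        drain_obj_stock();
}

static void obj_cgroup_charge(unsigned int size)
{
    unsigned int nr_pages = size / PAGE_SIZE;
    unsigned int nr_bytes = size % PAGE_SIZE;

    if (stock_bytes >= size) {     /* consume_obj_stock() fast path */
        stock_bytes -= size;
        return;
    }
    if (nr_bytes)
        nr_pages++;                /* round the charge up to whole pages */
    charged_pages += nr_pages;
    if (nr_bytes)
        refill_obj_stock(PAGE_SIZE - nr_bytes);  /* keep the change */
}

static void obj_cgroup_uncharge(unsigned int size)
{
    refill_obj_stock(size);
}

int main(void)
{
    unsigned int i;

    for (i = 0; i < 40; i++)       /* forty 100-byte objects */
        obj_cgroup_charge(100);
    printf("charged: pages=%lu stock=%uB\n", charged_pages, stock_bytes);

    for (i = 0; i < 40; i++)       /* ... and free them all again */
        obj_cgroup_uncharge(100);
    drain_obj_stock();             /* e.g. drain_local_stock() */
    printf("freed:   pages=%lu stock=%uB central=%uB\n",
           charged_pages, stock_bytes, central_bytes);
    return 0;
}

Forty 100-byte objects end up costing one page rather than forty, and once everything is freed a drain hands that page back to the counter, which is the same situation obj_cgroup_release() handles for whatever is still outstanding when the objcg itself dies.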
rcu_assign_pointer(memcg->objcg, objcg); + +	static_branch_enable(&memcg_kmem_enabled_key); +  	/*  	 * A memory cgroup is considered kmem-online as soon as it gets  	 * kmemcg_id. Setting the id after enabling static branching will @@ -3425,7 +3662,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)  	 */  	memcg->kmemcg_id = memcg_id;  	memcg->kmem_state = KMEM_ONLINE; -	INIT_LIST_HEAD(&memcg->kmem_caches);  	return 0;  } @@ -3438,22 +3674,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)  	if (memcg->kmem_state != KMEM_ONLINE)  		return; -	/* -	 * Clear the online state before clearing memcg_caches array -	 * entries. The slab_mutex in memcg_deactivate_kmem_caches() -	 * guarantees that no cache will be created for this cgroup -	 * after we are done (see memcg_create_kmem_cache()). -	 */ +  	memcg->kmem_state = KMEM_ALLOCATED;  	parent = parent_mem_cgroup(memcg);  	if (!parent)  		parent = root_mem_cgroup; -	/* -	 * Deactivate and reparent kmem_caches. -	 */ -	memcg_deactivate_kmem_caches(memcg, parent); +	memcg_reparent_objcgs(memcg, parent);  	kmemcg_id = memcg->kmemcg_id;  	BUG_ON(kmemcg_id < 0); @@ -3486,11 +3714,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)  	/* css_alloc() failed, offlining didn't happen */  	if (unlikely(memcg->kmem_state == KMEM_ONLINE))  		memcg_offline_kmem(memcg); - -	if (memcg->kmem_state == KMEM_ALLOCATED) { -		WARN_ON(!list_empty(&memcg->kmem_caches)); -		static_branch_dec(&memcg_kmem_enabled_key); -	}  }  #else  static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -4800,9 +5023,6 @@ static struct cftype mem_cgroup_legacy_files[] = {  	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))  	{  		.name = "kmem.slabinfo", -		.seq_start = memcg_slab_start, -		.seq_next = memcg_slab_next, -		.seq_stop = memcg_slab_stop,  		.seq_show = memcg_slab_show,  	},  #endif @@ -4917,13 +5137,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)  	if (!pn)  		return 1; -	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat); +	pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, +						 GFP_KERNEL_ACCOUNT);  	if (!pn->lruvec_stat_local) {  		kfree(pn);  		return 1;  	} -	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); +	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, +					       GFP_KERNEL_ACCOUNT);  	if (!pn->lruvec_stat_cpu) {  		free_percpu(pn->lruvec_stat_local);  		kfree(pn); @@ -4997,11 +5219,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)  		goto fail;  	} -	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); +	memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, +						GFP_KERNEL_ACCOUNT);  	if (!memcg->vmstats_local)  		goto fail; -	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); +	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, +						 GFP_KERNEL_ACCOUNT);  	if (!memcg->vmstats_percpu)  		goto fail; @@ -5022,6 +5246,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)  	memcg->socket_pressure = jiffies;  #ifdef CONFIG_MEMCG_KMEM  	memcg->kmemcg_id = -1; +	INIT_LIST_HEAD(&memcg->objcg_list);  #endif  #ifdef CONFIG_CGROUP_WRITEBACK  	INIT_LIST_HEAD(&memcg->cgwb_list); @@ -5049,7 +5274,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  	struct mem_cgroup *memcg;  	long error = -ENOMEM; +	memalloc_use_memcg(parent);  	memcg = mem_cgroup_alloc(); +	memalloc_unuse_memcg();  	if (IS_ERR(memcg))  		return ERR_CAST(memcg); @@ -5084,9 +5311,6 @@ mem_cgroup_css_alloc(struct 
cgroup_subsys_state *parent_css)  	/* The following stuff does not apply to the root */  	if (!parent) { -#ifdef CONFIG_MEMCG_KMEM -		INIT_LIST_HEAD(&memcg->kmem_caches); -#endif  		root_mem_cgroup = memcg;  		return &memcg->css;  	} @@ -5365,7 +5589,7 @@ static int mem_cgroup_move_account(struct page *page,  {  	struct lruvec *from_vec, *to_vec;  	struct pglist_data *pgdat; -	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; +	unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;  	int ret;  	VM_BUG_ON(from == to); @@ -5448,7 +5672,10 @@ static int mem_cgroup_move_account(struct page *page,  	 */  	smp_mb(); -	page->mem_cgroup = to; 	/* caller should have done css_get */ +	css_get(&to->css); +	css_put(&from->css); + +	page->mem_cgroup = to;  	__unlock_page_memcg(from); @@ -5669,8 +5896,6 @@ static void __mem_cgroup_clear_mc(void)  		if (!mem_cgroup_is_root(mc.to))  			page_counter_uncharge(&mc.to->memory, mc.moved_swap); -		css_put_many(&mc.to->css, mc.moved_swap); -  		mc.moved_swap = 0;  	}  	memcg_oom_recover(from); @@ -6036,7 +6261,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,  				 char *buf, size_t nbytes, loff_t off)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); -	unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; +	unsigned int nr_retries = MAX_RECLAIM_RETRIES;  	bool drained = false;  	unsigned long high;  	int err; @@ -6046,8 +6271,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,  	if (err)  		return err; -	page_counter_set_high(&memcg->memory, high); -  	for (;;) {  		unsigned long nr_pages = page_counter_read(&memcg->memory);  		unsigned long reclaimed; @@ -6071,6 +6294,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,  			break;  	} +	page_counter_set_high(&memcg->memory, high); + +	memcg_wb_domain_size_changed(memcg); +  	return nbytes;  } @@ -6084,7 +6311,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,  				char *buf, size_t nbytes, loff_t off)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); -	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; +	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;  	bool drained = false;  	unsigned long max;  	int err; @@ -6391,40 +6618,42 @@ static unsigned long effective_protection(unsigned long usage,   *   * WARNING: This function is not stateless! It can only be used as part   *          of a top-down tree iteration, not for isolated queries. - * - * Returns one of the following: - *   MEMCG_PROT_NONE: cgroup memory is not protected - *   MEMCG_PROT_LOW: cgroup memory is protected as long there is - *     an unprotected supply of reclaimable memory from other cgroups. - *   MEMCG_PROT_MIN: cgroup memory is protected   */ -enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, -						struct mem_cgroup *memcg) +void mem_cgroup_calculate_protection(struct mem_cgroup *root, +				     struct mem_cgroup *memcg)  {  	unsigned long usage, parent_usage;  	struct mem_cgroup *parent;  	if (mem_cgroup_disabled()) -		return MEMCG_PROT_NONE; +		return;  	if (!root)  		root = root_mem_cgroup; + +	/* +	 * Effective values of the reclaim targets are ignored so they +	 * can be stale. Have a look at mem_cgroup_protection for more +	 * details. +	 * TODO: calculation should be more robust so that we do not need +	 * that special casing. 
+	 */  	if (memcg == root) -		return MEMCG_PROT_NONE; +		return;  	usage = page_counter_read(&memcg->memory);  	if (!usage) -		return MEMCG_PROT_NONE; +		return;  	parent = parent_mem_cgroup(memcg);  	/* No parent means a non-hierarchical mode on v1 memcg */  	if (!parent) -		return MEMCG_PROT_NONE; +		return;  	if (parent == root) {  		memcg->memory.emin = READ_ONCE(memcg->memory.min);  		memcg->memory.elow = READ_ONCE(memcg->memory.low); -		goto out; +		return;  	}  	parent_usage = page_counter_read(&parent->memory); @@ -6438,14 +6667,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,  			READ_ONCE(memcg->memory.low),  			READ_ONCE(parent->memory.elow),  			atomic_long_read(&parent->memory.children_low_usage))); - -out: -	if (usage <= memcg->memory.emin) -		return MEMCG_PROT_MIN; -	else if (usage <= memcg->memory.elow) -		return MEMCG_PROT_LOW; -	else -		return MEMCG_PROT_NONE;  }  /** @@ -6461,7 +6682,7 @@ out:   */  int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)  { -	unsigned int nr_pages = hpage_nr_pages(page); +	unsigned int nr_pages = thp_nr_pages(page);  	struct mem_cgroup *memcg = NULL;  	int ret = 0; @@ -6498,6 +6719,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)  	if (ret)  		goto out_put; +	css_get(&memcg->css);  	commit_charge(page, memcg);  	local_irq_disable(); @@ -6552,9 +6774,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)  	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);  	memcg_check_events(ug->memcg, ug->dummy_page);  	local_irq_restore(flags); - -	if (!mem_cgroup_is_root(ug->memcg)) -		css_put_many(&ug->memcg->css, ug->nr_pages);  }  static void uncharge_page(struct page *page, struct uncharge_gather *ug) @@ -6592,6 +6811,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)  	ug->dummy_page = page;  	page->mem_cgroup = NULL; +	css_put(&ug->memcg->css);  }  static void uncharge_list(struct list_head *page_list) @@ -6692,13 +6912,13 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)  		return;  	/* Force-charge the new page. The old one will be freed soon */ -	nr_pages = hpage_nr_pages(newpage); +	nr_pages = thp_nr_pages(newpage);  	page_counter_charge(&memcg->memory, nr_pages);  	if (do_memsw_account())  		page_counter_charge(&memcg->memsw, nr_pages); -	css_get_many(&memcg->css, nr_pages); +	css_get(&memcg->css);  	commit_charge(newpage, memcg);  	local_irq_save(flags); @@ -6821,17 +7041,6 @@ static int __init mem_cgroup_init(void)  {  	int cpu, node; -#ifdef CONFIG_MEMCG_KMEM -	/* -	 * Kmem cache creation is mostly done with the slab_mutex held, -	 * so use a workqueue with limited concurrency to avoid stalling -	 * all worker threads in case lots of cgroups are created and -	 * destroyed simultaneously. -	 */ -	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); -	BUG_ON(!memcg_kmem_cache_wq); -#endif -  	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,  				  memcg_hotplug_cpu_dead); @@ -6905,7 +7114,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)  	 * ancestor for the swap instead and transfer the memory+swap charge.  	 
*/  	swap_memcg = mem_cgroup_id_get_online(memcg); -	nr_entries = hpage_nr_pages(page); +	nr_entries = thp_nr_pages(page);  	/* Get references for the tail pages, too */  	if (nr_entries > 1)  		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); @@ -6935,8 +7144,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)  	mem_cgroup_charge_statistics(memcg, page, -nr_entries);  	memcg_check_events(memcg, page); -	if (!mem_cgroup_is_root(memcg)) -		css_put_many(&memcg->css, nr_entries); +	css_put(&memcg->css);  }  /** @@ -6950,7 +7158,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)   */  int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)  { -	unsigned int nr_pages = hpage_nr_pages(page); +	unsigned int nr_pages = thp_nr_pages(page);  	struct page_counter *counter;  	struct mem_cgroup *memcg;  	unsigned short oldid;  |