Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	478
1 file changed, 92 insertions, 386 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
 	"unevictable",
 };
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-	MEM_CGROUP_TARGET_THRESH,
-	MEM_CGROUP_TARGET_SOFTLIMIT,
-	MEM_CGROUP_TARGET_NUMAINFO,
-	MEM_CGROUP_NTARGETS,
-};
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET	1024
-struct mem_cgroup_stat_cpu {
-	long count[MEM_CGROUP_STAT_NSTATS];
-	unsigned long events[MEMCG_NR_EVENTS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
-	struct mem_cgroup *position;
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
-	struct lruvec		lruvec;
-	unsigned long		lru_size[NR_LRU_LISTS];
-
-	struct reclaim_iter	iter[DEF_PRIORITY + 1];
-
-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long		usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
-	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
-						/* use container_of	   */
-};
-
-struct mem_cgroup_per_node {
-	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
-	struct eventfd_ctx *eventfd;
-	unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
-	/* An array index points to threshold just below or equal to usage. */
-	int current_threshold;
-	/* Size of entries[] */
-	unsigned int size;
-	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
-	/* Primary thresholds array */
-	struct mem_cgroup_threshold_ary *primary;
-	/*
-	 * Spare threshold array.
-	 * This is needed to make mem_cgroup_unregister_event() "never fail".
-	 * It must be able to store at least primary->size - 1 entries.
-	 */
-	struct mem_cgroup_threshold_ary *spare;
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
-	struct cgroup_subsys_state css;
-
-	/* Accounted resources */
-	struct page_counter memory;
-	struct page_counter memsw;
-	struct page_counter kmem;
-
-	/* Normal memory consumption range */
-	unsigned long low;
-	unsigned long high;
-
-	unsigned long soft_limit;
-
-	/* vmpressure notifications */
-	struct vmpressure vmpressure;
-
-	/* css_online() has been completed */
-	int initialized;
-
-	/*
-	 * Should the accounting and control be hierarchical, per subtree?
-	 */
-	bool use_hierarchy;
-
-	/* protected by memcg_oom_lock */
-	bool		oom_lock;
-	int		under_oom;
-
-	int	swappiness;
-	/* OOM-Killer disable */
-	int		oom_kill_disable;
-
-	/* protect arrays of thresholds */
-	struct mutex thresholds_lock;
-
-	/* thresholds for memory usage. RCU-protected */
-	struct mem_cgroup_thresholds thresholds;
-
-	/* thresholds for mem+swap usage. RCU-protected */
-	struct mem_cgroup_thresholds memsw_thresholds;
-
-	/* For oom notifier event fd */
-	struct list_head oom_notify;
-
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
-	/*
-	 * set > 0 if pages under this cgroup are moving to other cgroup.
-	 */
-	atomic_t		moving_account;
-	/* taken only while moving_account > 0 */
-	spinlock_t		move_lock;
-	struct task_struct	*move_lock_task;
-	unsigned long		move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
-	spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-	struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params.memcg_caches array */
-	int kmemcg_id;
-	bool kmem_acct_activated;
-	bool kmem_acct_active;
-#endif
-
-	int last_scanned_node;
-#if MAX_NUMNODES > 1
-	nodemask_t	scan_nodes;
-	atomic_t	numainfo_events;
-	atomic_t	numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
-	struct list_head cgwb_list;
-	struct wb_domain cgwb_domain;
-#endif
-
-	/* List of events which userspace want to receive */
-	struct list_head event_list;
-	spinlock_t event_list_lock;
-
-	struct mem_cgroup_per_node *nodeinfo[0];
-	/* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
-	return memcg->kmem_acct_active;
-}
-#endif
-
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
-	return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) &&
-		    memcg_proto_active(cg_proto) &&
+		if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
 		    css_tryget_online(&memcg->css)) {
 			sk->sk_cgrp = cg_proto;
 		}
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-	return &memcg->css;
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -631,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 	return &memcg->css;
 }
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+	struct mem_cgroup *memcg;
+	unsigned long ino = 0;
+
+	rcu_read_lock();
+	memcg = READ_ONCE(page->mem_cgroup);
+	while (memcg && !(memcg->css.flags & CSS_ONLINE))
+		memcg = parent_mem_cgroup(memcg);
+	if (memcg)
+		ino = cgroup_ino(memcg->css.cgroup);
+	rcu_read_unlock();
+	return ino;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -876,14 +714,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	return mz->lru_size[lru];
-}
-
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 						  int nid,
 						  unsigned int lru_mask)
@@ -986,6 +816,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
+EXPORT_SYMBOL(mem_cgroup_from_task);
 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -1031,7 +862,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
@@ -1173,30 +1004,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (unlikely(!memcg))
-		goto out;
-
-	switch (idx) {
-	case PGFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-		break;
-	case PGMAJFAULT:
-		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-		break;
-	default:
-		BUG();
-	}
-out:
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
@@ -1295,15 +1102,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	VM_BUG_ON((long)(*lru_size) < 0);
 }
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
-	if (root == memcg)
-		return true;
-	if (!root->use_hierarchy)
-		return false;
-	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *task_memcg;
@@ -1330,39 +1128,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 	return ret;
 }
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	unsigned long inactive_ratio;
-	unsigned long inactive;
-	unsigned long active;
-	unsigned long gb;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
-
-	return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup *memcg;
-
-	if (mem_cgroup_disabled())
-		return true;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-	memcg = mz->memcg;
-
-	return !!(memcg->css.flags & CSS_ONLINE);
-}
-
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1159,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 	return margin;
 }
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
-	/* root ? */
-	if (mem_cgroup_disabled() || !memcg->css.parent)
-		return vm_swappiness;
-
-	return memcg->swappiness;
-}
-
 /*
  * A routine for checking "mem" is under move_account() or not.
  *
@@ -1545,6 +1301,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.gfp_mask = gfp_mask,
+		.order = order,
+	};
 	struct mem_cgroup *iter;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
@@ -1563,7 +1325,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		goto unlock;
 	}
-	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
 	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
@@ -1571,8 +1333,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		css_task_iter_start(&iter->css, &it);
 		while ((task = css_task_iter_next(&it))) {
-			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false)) {
+			switch (oom_scan_process_thread(&oc, task, totalpages)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
@@ -1610,8 +1371,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (chosen) {
 		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
-				 memcg, NULL, "Memory cgroup out of memory");
+		oom_kill_process(&oc, chosen, points, totalpages, memcg,
+				 "Memory cgroup out of memory");
 	}
 unlock:
 	mutex_unlock(&oom_lock);
@@ -2062,23 +1823,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx, int val)
-{
-	VM_BUG_ON(!rcu_read_lock_held());
-
-	if (memcg)
-		this_cpu_add(memcg->stat->count[idx], val);
-}
-
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -2355,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 	css_put_many(&memcg->css, nr_pages);
 }
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-	struct mem_cgroup *memcg;
-	unsigned short id;
-	swp_entry_t ent;
-
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-	memcg = page->mem_cgroup;
-	if (memcg) {
-		if (!css_tryget_online(&memcg->css))
-			memcg = NULL;
-	} else if (PageSwapCache(page)) {
-		ent.val = page_private(page);
-		id = lookup_swap_cgroup_id(ent);
-		rcu_read_lock();
-		memcg = mem_cgroup_from_id(id);
-		if (memcg && !css_tryget_online(&memcg->css))
-			memcg = NULL;
-		rcu_read_unlock();
-	}
-	return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
 	struct zone *zone = page_zone(page);
@@ -2504,16 +2214,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
 	css_put_many(&memcg->css, nr_pages);
 }
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-	return memcg ? memcg->kmemcg_id : -1;
-}
-
 static int memcg_alloc_cache_id(void)
 {
 	int id, size;
@@ -5127,10 +4827,12 @@ static void mem_cgroup_clear_mc(void)
 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 				 struct cgroup_taskset *tset)
 {
-	struct task_struct *p = cgroup_taskset_first(tset);
-	int ret = 0;
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup *from;
+	struct task_struct *p;
+	struct mm_struct *mm;
 	unsigned long move_flags;
+	int ret = 0;
 	/*
 	 * We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4840,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 	 * So we need to save it, and keep it going.
 	 */
 	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
-	if (move_flags) {
-		struct mm_struct *mm;
-		struct mem_cgroup *from = mem_cgroup_from_task(p);
+	if (!move_flags)
+		return 0;
-		VM_BUG_ON(from == memcg);
+	p = cgroup_taskset_first(tset);
+	from = mem_cgroup_from_task(p);
-		mm = get_task_mm(p);
-		if (!mm)
-			return 0;
-		/* We move charges only when we move a owner of the mm */
-		if (mm->owner == p) {
-			VM_BUG_ON(mc.from);
-			VM_BUG_ON(mc.to);
-			VM_BUG_ON(mc.precharge);
-			VM_BUG_ON(mc.moved_charge);
-			VM_BUG_ON(mc.moved_swap);
-
-			spin_lock(&mc.lock);
-			mc.from = from;
-			mc.to = memcg;
-			mc.flags = move_flags;
-			spin_unlock(&mc.lock);
-			/* We set mc.moving_task later */
-
-			ret = mem_cgroup_precharge_mc(mm);
-			if (ret)
-				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+	VM_BUG_ON(from == memcg);
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return 0;
+	/* We move charges only when we move a owner of the mm */
+	if (mm->owner == p) {
+		VM_BUG_ON(mc.from);
+		VM_BUG_ON(mc.to);
+		VM_BUG_ON(mc.precharge);
+		VM_BUG_ON(mc.moved_charge);
+		VM_BUG_ON(mc.moved_swap);
+
+		spin_lock(&mc.lock);
+		mc.from = from;
+		mc.to = memcg;
+		mc.flags = move_flags;
+		spin_unlock(&mc.lock);
+		/* We set mc.moving_task later */
+
+		ret = mem_cgroup_precharge_mc(mm);
+		if (ret)
+			mem_cgroup_clear_mc();
 	}
+	mmput(mm);
 	return ret;
 }
@@ -5521,19 +5224,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
 };
 /**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
-		       enum mem_cgroup_events_index idx,
-		       unsigned int nr)
-{
-	this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
  * mem_cgroup_low - check if memory consumption is below the normal range
 * @root: the highest ancestor to consider
 * @memcg: the memory cgroup to check
@@ -5605,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * the page lock, which serializes swap cache removal, which
 		 * in turn serializes uncharging.
 		 */
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		if (page->mem_cgroup)
 			goto out;
+
+		if (do_swap_account) {
+			swp_entry_t ent = { .val = page_private(page), };
+			unsigned short id = lookup_swap_cgroup_id(ent);
+
+			rcu_read_lock();
+			memcg = mem_cgroup_from_id(id);
+			if (memcg && !css_tryget_online(&memcg->css))
+				memcg = NULL;
+			rcu_read_unlock();
+		}
 	}
 	if (PageTransHuge(page)) {
@@ -5614,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 	}
-	if (do_swap_account && PageSwapCache(page))
-		memcg = try_get_mem_cgroup_from_page(page);
 	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
@@ -5965,7 +5665,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	if (!mem_cgroup_is_root(memcg))
 		page_counter_uncharge(&memcg->memory, 1);
-	/* Caller disabled preemption with mapping->tree_lock */
+	/*
+	 * Interrupts should be disabled here because the caller holds the
+	 * mapping->tree_lock lock which is taken with interrupts-off. It is
+	 * important here to have the interrupts disabled because it is the
+	 * only synchronisation we have for udpating the per-CPU variables.
+	 */
+	VM_BUG_ON(!irqs_disabled());
 	mem_cgroup_charge_statistics(memcg, page, -1);
 	memcg_check_events(memcg, page);
 }
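Illustrative note (not part of the patch): the sketch below shows how a procfs-style reader might consume the new page_cgroup_ino() helper added above, given its documented caveat that the returned inode number is only a best-effort, racy hint. The function name dump_page_cgroup() is made up for this example; pfn_valid(), pfn_to_page(), page_cgroup_ino() and pr_info() are existing kernel interfaces.

#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/printk.h>

/*
 * Hypothetical example: report which memcg a given pfn is charged to.
 * No reference on the page is taken, matching the "callers that do not
 * care" contract in the page_cgroup_ino() kerneldoc.
 */
static void dump_page_cgroup(unsigned long pfn)
{
	struct page *page;
	ino_t ino;

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	ino = page_cgroup_ino(page);	/* 0 means "not charged" */
	pr_info("pfn %lu: memcg inode %lu\n", pfn, (unsigned long)ino);
}

Because the cgroup inode can be torn down right after the lookup, a caller like this treats the value purely as diagnostic output rather than as a handle to the cgroup.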