Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	298
1 file changed, 197 insertions(+), 101 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..15af3da5af02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = {
 
 struct mem_cgroup_tree_per_node {
 	struct rb_root rb_root;
+	struct rb_node *rb_rightmost;
 	spinlock_t lock;
 };
 
@@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 	struct rb_node **p = &mctz->rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	struct mem_cgroup_per_node *mz_node;
+	bool rightmost = true;
 
 	if (mz->on_tree)
 		return;
@@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 					tree_node);
-		if (mz->usage_in_excess < mz_node->usage_in_excess)
+		if (mz->usage_in_excess < mz_node->usage_in_excess) {
 			p = &(*p)->rb_left;
+			rightmost = false;
+		}
+
 		/*
 		 * We can't avoid mem cgroups that are over their soft
 		 * limit by the same amount
@@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 			p = &(*p)->rb_right;
 	}
+
+	if (rightmost)
+		mctz->rb_rightmost = &mz->tree_node;
+
 	rb_link_node(&mz->tree_node, parent, p);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
@@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 {
 	if (!mz->on_tree)
 		return;
+
+	if (&mz->tree_node == mctz->rb_rightmost)
+		mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
 	rb_erase(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = false;
 }
@@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 static struct mem_cgroup_per_node *
 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
-	struct rb_node *rightmost = NULL;
 	struct mem_cgroup_per_node *mz;
 
 retry:
 	mz = NULL;
-	rightmost = rb_last(&mctz->rb_root);
-	if (!rightmost)
+	if (!mctz->rb_rightmost)
 		goto done;		/* Nothing to reclaim from */
 
-	mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
+	mz = rb_entry(mctz->rb_rightmost,
+		      struct mem_cgroup_per_node, tree_node);
 	/*
 	 * Remove the node now but someone else can add it back,
 	 * we will add it back at the end of reclaim to its correct
@@ -550,10 +562,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * value, and reading all cpu value can be performance bottleneck in some
  * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
+ *
+ * The parameter idx can be of type enum memcg_event_item or vm_event_item.
  */
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      enum memcg_event_item event)
+				      int event)
 {
	unsigned long val = 0;
	int cpu;
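
The rb_rightmost hunks above cache the tree's maximum element so soft-limit reclaim can pick its next victim in O(1) instead of paying an O(log n) rb_last() walk on every selection: the insert path notes whether it ever descended left, and the erase path falls back to rb_prev() when the cached node is removed. Below is a minimal userspace sketch of the same caching pattern, with a plain unbalanced BST standing in for the kernel rbtree (all names here are illustrative, not kernel API; the erase side is omitted):

#include <stdio.h>

struct node {
	long key;
	struct node *left, *right;
};

struct tree {
	struct node *root;
	struct node *rightmost;	/* cached maximum, updated on insert */
};

static void tree_insert(struct tree *t, struct node *n)
{
	struct node **p = &t->root;
	int rightmost = 1;	/* stays true while we only descend right */

	n->left = n->right = NULL;
	while (*p) {
		if (n->key < (*p)->key) {
			p = &(*p)->left;
			rightmost = 0;	/* went left once: n is not the max */
		} else {
			p = &(*p)->right;
		}
	}
	*p = n;
	if (rightmost)		/* never went left: n is the new maximum */
		t->rightmost = n;
}

int main(void)
{
	struct tree t = { 0 };
	struct node a = { .key = 5 }, b = { .key = 9 }, c = { .key = 2 };

	tree_insert(&t, &a);
	tree_insert(&t, &b);
	tree_insert(&t, &c);
	printf("max = %ld\n", t.rightmost->key);	/* prints 9, no tree walk */
	return 0;
}
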
@@ -917,7 +931,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		struct css_task_iter it;
 		struct task_struct *task;
 
-		css_task_iter_start(&iter->css, &it);
+		css_task_iter_start(&iter->css, 0, &it);
 		while (!ret && (task = css_task_iter_next(&it)))
 			ret = fn(task, arg);
 		css_task_iter_end(&it);
@@ -1790,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	}
 	stock->nr_pages += nr_pages;
 
+	if (stock->nr_pages > CHARGE_BATCH)
+		drain_stock(stock);
+
 	local_irq_restore(flags);
 }
 
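
The refill_stock() hunk bounds the per-CPU charge cache: pages returned by uncharge are parked in the local stock, but once it exceeds CHARGE_BATCH the whole stock is drained back to the page counters, so no CPU can hoard pages indefinitely. A rough single-threaded sketch of that bounded-cache idea follows; CHARGE_BATCH mirrors the 32-page value the kernel uses at this point, and drain_stock() is stubbed out:

#include <stdio.h>

#define CHARGE_BATCH 32UL

struct stock {
	unsigned long nr_pages;	/* pages cached on this "CPU" */
};

static void drain_stock(struct stock *s)
{
	/* in the kernel this returns the pages to the memcg page counters */
	printf("draining %lu pages back to the counters\n", s->nr_pages);
	s->nr_pages = 0;
}

static void refill_stock(struct stock *s, unsigned long nr_pages)
{
	s->nr_pages += nr_pages;
	/* bound the cache so uncharges cannot accumulate without limit */
	if (s->nr_pages > CHARGE_BATCH)
		drain_stock(s);
}

int main(void)
{
	struct stock s = { 0 };

	refill_stock(&s, 20);	/* cached locally */
	refill_stock(&s, 20);	/* 40 > 32: drained */
	printf("left in stock: %lu\n", s.nr_pages);
	return 0;
}
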
@@ -1915,7 +1932,7 @@ retry:
 	 * bypass the last charges so that they can exit quickly and
 	 * free their memory.
 	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+	if (unlikely(tsk_is_oom_victim(current) ||
 		     fatal_signal_pending(current) ||
 		     current->flags & PF_EXITING))
 		goto force;
@@ -4319,6 +4336,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
+	memcg->low = 0;
+
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 
@@ -4410,12 +4429,13 @@ enum mc_target_type {
 	MC_TARGET_NONE = 0,
 	MC_TARGET_PAGE,
 	MC_TARGET_SWAP,
+	MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
-	struct page *page = vm_normal_page(vma, addr, ptent);
+	struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
 	if (!page || !page_mapped(page))
 		return NULL;
@@ -4432,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 	return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			pte_t ptent, swp_entry_t *entry)
 {
@@ -4441,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 
 	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
 		return NULL;
+
+	/*
+	 * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to
+	 * a device; because they are not accessible by the CPU they are
+	 * stored as special swap entries in the CPU page table.
+	 */
+	if (is_device_private_entry(ent)) {
+		page = device_private_entry_to_page(ent);
+		/*
+		 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
+		 * a refcount of 1 when free (unlike a normal page)
+		 */
+		if (!page_ref_add_unless(page, 1, 1))
+			return NULL;
+		return page;
+	}
+
 	/*
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
@@ -4601,6 +4638,13 @@ out:
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. if @target is not NULL, the entry is stored
 *     in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PUBLIC
+ *     or MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, thus not on the LRU).
+ *     For now such a page is charged like a regular page would be, as for
+ *     all intents and purposes it is just special memory taking the place
+ *     of a regular page.
+ *
+ *     See Documentation/vm/hmm.txt and include/linux/hmm.h
 *
 * Called with pte lock held.
 */
@@ -4629,14 +4673,20 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
+			if (is_device_private_page(page) ||
+			    is_device_public_page(page))
+				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
 		}
 		if (!ret || !target)
 			put_page(page);
 	}
-	/* There is a swap entry and a page doesn't exist or isn't charged */
-	if (ent.val && !ret &&
+	/*
+	 * There is a swap entry and a page doesn't exist or isn't charged.
+	 * But we cannot move a tail-page in a THP.
+	 */
+	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
 		ret = MC_TARGET_SWAP;
 		if (target)
@@ -4647,8 +4697,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -4657,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 	struct page *page = NULL;
 	enum mc_target_type ret = MC_TARGET_NONE;
 
+	if (unlikely(is_swap_pmd(pmd))) {
+		VM_BUG_ON(thp_migration_supported() &&
+				  !is_pmd_migration_entry(pmd));
+		return ret;
+	}
 	page = pmd_page(pmd);
 	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
 	if (!(mc.flags & MOVE_ANON))
@@ -4688,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
+		/*
+		 * Note there can not be MC_TARGET_DEVICE for now as we do not
+		 * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+		 * MEMORY_DEVICE_PRIVATE, but this might change.
+		 */
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
 		spin_unlock(ptl);
@@ -4903,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 				putback_lru_page(page);
 			}
 			put_page(page);
+		} else if (target_type == MC_TARGET_DEVICE) {
+			page = target.page;
+			if (!mem_cgroup_move_account(page, true,
+						     mc.from, mc.to)) {
+				mc.precharge -= HPAGE_PMD_NR;
+				mc.moved_charge += HPAGE_PMD_NR;
+			}
+			put_page(page);
 		}
 		spin_unlock(ptl);
 		return 0;
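
In the mc_handle_swap_pte() hunk above, a device-private page is pinned with page_ref_add_unless(page, 1, 1): take a reference only if the refcount is not 1, because a free ZONE_DEVICE page idles at refcount 1 rather than 0. A standalone sketch of that add-unless idiom using C11 atomics (ref_add_unless is a hypothetical name for illustration, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Add @add to *refcount unless it currently equals @unless. */
static bool ref_add_unless(atomic_int *refcount, int add, int unless)
{
	int old = atomic_load(refcount);

	while (old != unless) {
		/* on failure, 'old' is reloaded with the current value */
		if (atomic_compare_exchange_weak(refcount, &old, old + add))
			return true;
	}
	return false;	/* the object looked "free"; do not touch it */
}

int main(void)
{
	atomic_int busy = 2, freed = 1;

	printf("busy:  %d\n", ref_add_unless(&busy, 1, 1));	/* 1: pinned */
	printf("freed: %d\n", ref_add_unless(&freed, 1, 1));	/* 0: refused */
	return 0;
}
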
@@ -4914,12 +4982,16 @@ retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
+		bool device = false;
 		swp_entry_t ent;
 
 		if (!mc.precharge)
 			break;
 
 		switch (get_mctgt_type(vma, addr, ptent, &target)) {
+		case MC_TARGET_DEVICE:
+			device = true;
+			/* fall through */
 		case MC_TARGET_PAGE:
 			page = target.page;
 			/*
@@ -4930,7 +5002,7 @@ retry:
 			 */
 			if (PageTransCompound(page))
 				goto put;
-			if (isolate_lru_page(page))
+			if (!device && isolate_lru_page(page))
 				goto put;
 			if (!mem_cgroup_move_account(page, false,
 						mc.from, mc.to)) {
@@ -4938,7 +5010,8 @@ retry:
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
 			}
-			putback_lru_page(page);
+			if (!device)
+				putback_lru_page(page);
 put:			/* get_mctgt_type() gets the page */
 			put_page(page);
 			break;
@@ -5423,7 +5496,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * in turn serializes uncharging.
 		 */
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
-		if (page->mem_cgroup)
+		if (compound_head(page)->mem_cgroup)
 			goto out;
 
 		if (do_swap_account) {
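
The compound_head() fix in the mem_cgroup_try_charge() hunk matters because only the head page of a compound page carries the mem_cgroup pointer; testing a tail page's own field would miss an existing charge. A toy model of the head-lookup idiom (the kernel actually encodes the head pointer as a tagged word in struct page; this sketch simplifies it to a plain pointer):

#include <stdio.h>
#include <stdbool.h>

/* toy page: tail pages point at their compound head, as in the kernel */
struct page {
	struct page *head;	/* NULL for a head page */
	void *mem_cgroup;	/* only valid on the head */
};

static struct page *compound_head(struct page *page)
{
	return page->head ? page->head : page;
}

static bool page_charged(struct page *page)
{
	/* always look at the head: tails never carry the memcg pointer */
	return compound_head(page)->mem_cgroup != NULL;
}

int main(void)
{
	struct page head = { NULL, (void *)0x1 };
	struct page tail = { &head, NULL };

	printf("head charged: %d\n", page_charged(&head));	/* 1 */
	printf("tail charged: %d\n", page_charged(&tail));	/* 1, via head */
	return 0;
}
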
@@ -5528,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
 	cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-			   unsigned long nr_anon, unsigned long nr_file,
-			   unsigned long nr_kmem, unsigned long nr_huge,
-			   unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+	struct mem_cgroup *memcg;
+	unsigned long pgpgout;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+	unsigned long nr_kmem;
+	unsigned long nr_huge;
+	unsigned long nr_shmem;
+	struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+	memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
 {
-	unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
 	unsigned long flags;
 
-	if (!mem_cgroup_is_root(memcg)) {
-		page_counter_uncharge(&memcg->memory, nr_pages);
+	if (!mem_cgroup_is_root(ug->memcg)) {
+		page_counter_uncharge(&ug->memcg->memory, nr_pages);
 		if (do_memsw_account())
-			page_counter_uncharge(&memcg->memsw, nr_pages);
-		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-			page_counter_uncharge(&memcg->kmem, nr_kmem);
-		memcg_oom_recover(memcg);
+			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+		memcg_oom_recover(ug->memcg);
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-	__this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-	__this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-	__this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-	__this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-	memcg_check_events(memcg, dummy_page);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+	__this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+	__this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
-	if (!mem_cgroup_is_root(memcg))
-		css_put_many(&memcg->css, nr_pages);
+	if (!mem_cgroup_is_root(ug->memcg))
+		css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+	if (!page->mem_cgroup)
+		return;
+
+	/*
+	 * Nobody should be changing or seriously looking at
+	 * page->mem_cgroup at this point, we have fully
+	 * exclusive access to the page.
+	 */
+
+	if (ug->memcg != page->mem_cgroup) {
+		if (ug->memcg) {
+			uncharge_batch(ug);
+			uncharge_gather_clear(ug);
+		}
+		ug->memcg = page->mem_cgroup;
+	}
+
+	if (!PageKmemcg(page)) {
+		unsigned int nr_pages = 1;
+
+		if (PageTransHuge(page)) {
+			nr_pages <<= compound_order(page);
+			ug->nr_huge += nr_pages;
+		}
+		if (PageAnon(page))
+			ug->nr_anon += nr_pages;
+		else {
+			ug->nr_file += nr_pages;
+			if (PageSwapBacked(page))
+				ug->nr_shmem += nr_pages;
+		}
+		ug->pgpgout++;
+	} else {
+		ug->nr_kmem += 1 << compound_order(page);
+		__ClearPageKmemcg(page);
+	}
+
+	ug->dummy_page = page;
+	page->mem_cgroup = NULL;
 }
 
 static void uncharge_list(struct list_head *page_list)
 {
-	struct mem_cgroup *memcg = NULL;
-	unsigned long nr_shmem = 0;
-	unsigned long nr_anon = 0;
-	unsigned long nr_file = 0;
-	unsigned long nr_huge = 0;
-	unsigned long nr_kmem = 0;
-	unsigned long pgpgout = 0;
+	struct uncharge_gather ug;
 	struct list_head *next;
-	struct page *page;
+
+	uncharge_gather_clear(&ug);
 
 	/*
 	 * Note that the list can be a single page->lru; hence the
@@ -5577,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list)
 	 */
 	next = page_list->next;
 	do {
+		struct page *page;
+
 		page = list_entry(next, struct page, lru);
 		next = page->lru.next;
 
-		VM_BUG_ON_PAGE(PageLRU(page), page);
-		VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
-		if (!page->mem_cgroup)
-			continue;
-
-		/*
-		 * Nobody should be changing or seriously looking at
-		 * page->mem_cgroup at this point, we have fully
-		 * exclusive access to the page.
-		 */
-
-		if (memcg != page->mem_cgroup) {
-			if (memcg) {
-				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-					       nr_kmem, nr_huge, nr_shmem, page);
-				pgpgout = nr_anon = nr_file = nr_kmem = 0;
-				nr_huge = nr_shmem = 0;
-			}
-			memcg = page->mem_cgroup;
-		}
-
-		if (!PageKmemcg(page)) {
-			unsigned int nr_pages = 1;
-
-			if (PageTransHuge(page)) {
-				nr_pages <<= compound_order(page);
-				nr_huge += nr_pages;
-			}
-			if (PageAnon(page))
-				nr_anon += nr_pages;
-			else {
-				nr_file += nr_pages;
-				if (PageSwapBacked(page))
-					nr_shmem += nr_pages;
-			}
-			pgpgout++;
-		} else {
-			nr_kmem += 1 << compound_order(page);
-			__ClearPageKmemcg(page);
-		}
-
-		page->mem_cgroup = NULL;
+		uncharge_page(page, &ug);
 	} while (next != page_list);
 
-	if (memcg)
-		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-			       nr_kmem, nr_huge, nr_shmem, page);
+	if (ug.memcg)
+		uncharge_batch(&ug);
 }
 
 /**
@@ -5639,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list)
  */
 void mem_cgroup_uncharge(struct page *page)
 {
+	struct uncharge_gather ug;
+
 	if (mem_cgroup_disabled())
 		return;
 
@@ -5646,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page)
 	if (!page->mem_cgroup)
 		return;
 
-	INIT_LIST_HEAD(&page->lru);
-	uncharge_list(&page->lru);
+	uncharge_gather_clear(&ug);
+	uncharge_page(page, &ug);
+	uncharge_batch(&ug);
 }
 
 /**
@@ -5812,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
 	this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
 
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	css_put_many(&memcg->css, nr_pages);
+	refill_stock(memcg, nr_pages);
 }
 
 static int __init cgroup_memory(char *s)
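
The uncharge_gather rework above replaces a seven-argument helper with a struct and factors the per-page accounting into uncharge_page(), so both the list walker and the single-page mem_cgroup_uncharge() reuse the same gather-then-flush batching: counts accumulate while consecutive pages share a memcg and are flushed whenever the owner changes. A toy version of that pattern, with the memcg reduced to an integer owner id (names illustrative):

#include <stdio.h>
#include <string.h>

/* toy counterpart of struct uncharge_gather: one open batch per owner */
struct gather {
	int owner;		/* 0: no batch open */
	unsigned long count;
};

static void gather_clear(struct gather *g)
{
	memset(g, 0, sizeof(*g));
}

static void batch_flush(struct gather *g)
{
	if (g->owner)
		printf("uncharge %lu page(s) from memcg %d\n",
		       g->count, g->owner);
	gather_clear(g);
}

static void gather_one(struct gather *g, int owner)
{
	/* a different owner closes the running batch first */
	if (g->owner != owner) {
		batch_flush(g);
		g->owner = owner;
	}
	g->count++;
}

int main(void)
{
	int owners[] = { 1, 1, 2, 2, 2, 1 };
	struct gather g;
	unsigned long i;

	gather_clear(&g);
	for (i = 0; i < sizeof(owners) / sizeof(owners[0]); i++)
		gather_one(&g, owners[i]);
	batch_flush(&g);	/* close the final batch */
	return 0;
}
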
@@ -5869,6 +5957,7 @@ static int __init mem_cgroup_init(void)
 				    node_online(node) ? node : NUMA_NO_NODE);
 
 		rtpn->rb_root = RB_ROOT;
+		rtpn->rb_rightmost = NULL;
 		spin_lock_init(&rtpn->lock);
 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
 	}
@@ -5906,6 +5995,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg, *swap_memcg;
+	unsigned int nr_entries;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +6016,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * ancestor for the swap instead and transfer the memory+swap charge.
 	 */
 	swap_memcg = mem_cgroup_id_get_online(memcg);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+	nr_entries = hpage_nr_pages(page);
+	/* Get references for the tail pages, too */
+	if (nr_entries > 1)
+		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+				   nr_entries);
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(swap_memcg, 1);
+	mem_cgroup_swap_statistics(swap_memcg, nr_entries);
 
 	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
-		page_counter_uncharge(&memcg->memory, 1);
+		page_counter_uncharge(&memcg->memory, nr_entries);
 
 	if (memcg != swap_memcg) {
 		if (!mem_cgroup_is_root(swap_memcg))
-			page_counter_charge(&swap_memcg->memsw, 1);
-		page_counter_uncharge(&memcg->memsw, 1);
+			page_counter_charge(&swap_memcg->memsw, nr_entries);
+		page_counter_uncharge(&memcg->memsw, nr_entries);
 	}
 
 	/*
@@ -5948,7 +6043,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
 	VM_BUG_ON(!irqs_disabled());
-	mem_cgroup_charge_statistics(memcg, page, false, -1);
+	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+				     -nr_entries);
 	memcg_check_events(memcg, page);
 
 	if (!mem_cgroup_is_root(memcg))
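
With THP swap, mem_cgroup_swapout() now accounts hpage_nr_pages() swap entries in one go (HPAGE_PMD_NR, i.e. 512 with 4 KiB base pages and 2 MiB THPs) rather than one entry at a time. A toy model of the batched memory-to-memsw charge transfer the hunk performs, with struct page_counter reduced to a plain count (names illustrative):

#include <stdio.h>

/* toy stand-in for struct page_counter */
struct page_counter { long count; };

static void charge(struct page_counter *c, long n)   { c->count += n; }
static void uncharge(struct page_counter *c, long n) { c->count -= n; }

/*
 * Swap out nr_entries pages charged to 'from': release the memory charge
 * and, if the swap is accounted to a different (ancestor) group, move the
 * memory+swap charge over in one batched operation.
 */
static void swapout(struct page_counter *from_memory,
		    struct page_counter *from_memsw,
		    struct page_counter *to_memsw, long nr_entries)
{
	uncharge(from_memory, nr_entries);
	if (from_memsw != to_memsw) {
		charge(to_memsw, nr_entries);
		uncharge(from_memsw, nr_entries);
	}
}

int main(void)
{
	struct page_counter mem = { 1024 }, memsw = { 1024 }, anc = { 0 };

	swapout(&mem, &memsw, &anc, 512);	/* one 2 MiB THP */
	printf("memory=%ld memsw=%ld ancestor-memsw=%ld\n",
	       mem.count, memsw.count, anc.count);
	return 0;
}
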