Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 255
1 file changed, 127 insertions(+), 128 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59552d9..532e0e2a4817 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -248,6 +249,12 @@ enum res_type {
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
+static inline bool should_force_charge(void)
+{
+	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+		(current->flags & PF_EXITING);
+}
+
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -1293,32 +1300,39 @@ static const char *const memcg1_stat_names[] = {
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
 /**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
  */
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
 {
-	struct mem_cgroup *iter;
-	unsigned int i;
-
 	rcu_read_lock();
 
+	if (memcg) {
+		pr_cont(",oom_memcg=");
+		pr_cont_cgroup_path(memcg->css.cgroup);
+	} else
+		pr_cont(",global_oom");
 	if (p) {
-		pr_info("Task in ");
+		pr_cont(",task_memcg=");
 		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
-		pr_cont(" killed as a result of limit of ");
-	} else {
-		pr_info("Memory limit reached of cgroup ");
 	}
-
-	pr_cont_cgroup_path(memcg->css.cgroup);
-	pr_cont("\n");
-
 	rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+	unsigned int i;
 
 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
 		K((u64)page_counter_read(&memcg->memory)),
@@ -1382,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	};
 	bool ret;
 
-	mutex_lock(&oom_lock);
-	ret = out_of_memory(&oc);
+	if (mutex_lock_killable(&oom_lock))
+		return true;
+	/*
+	 * A few threads which were not waiting at mutex_lock_killable() can
+	 * fail to bail out. Therefore, check again after holding oom_lock.
+	 */
+	ret = should_force_charge() || out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
 	return ret;
 }
@@ -1666,9 +1685,14 @@ enum oom_status {
 
 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
+	enum oom_status ret;
+	bool locked;
+
 	if (order > PAGE_ALLOC_COSTLY_ORDER)
 		return OOM_SKIPPED;
 
+	memcg_memory_event(memcg, MEMCG_OOM);
+
 	/*
 	 * We are in the middle of the charge context here, so we
 	 * don't want to block when potentially sitting on a callstack
@@ -1698,10 +1722,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
 		return OOM_ASYNC;
 	}
 
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	mem_cgroup_unmark_under_oom(memcg);
 	if (mem_cgroup_out_of_memory(memcg, mask, order))
-		return OOM_SUCCESS;
+		ret = OOM_SUCCESS;
+	else
+		ret = OOM_FAILED;
+
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
 
-	return OOM_FAILED;
+	return ret;
 }
 
 /**
@@ -2184,9 +2221,7 @@ retry:
 	 * bypass the last charges so that they can exit quickly and
 	 * free their memory.
 	 */
-	if (unlikely(tsk_is_oom_victim(current) ||
-		     fatal_signal_pending(current) ||
-		     current->flags & PF_EXITING))
+	if (unlikely(should_force_charge()))
 		goto force;
 
 	/*
@@ -2250,8 +2285,6 @@ retry:
 	if (fatal_signal_pending(current))
 		goto force;
 
-	memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
 	/*
 	 * keep retrying as long as the memcg oom killer is able to make
 	 * a forward progress or bypass the charge if the oom killer
@@ -2329,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 
 static void lock_page_lru(struct page *page, int *isolated)
 {
-	struct zone *zone = page_zone(page);
+	pg_data_t *pgdat = page_pgdat(page);
 
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 	if (PageLRU(page)) {
 		struct lruvec *lruvec;
 
-		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		*isolated = 1;
@@ -2345,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated)
 
 static void unlock_page_lru(struct page *page, int isolated)
 {
-	struct zone *zone = page_zone(page);
+	pg_data_t *pgdat = page_pgdat(page);
 
 	if (isolated) {
 		struct lruvec *lruvec;
 
-		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		SetPageLRU(page);
 		add_page_to_lru_list(page, lruvec, page_lru(page));
 	}
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 }
 
 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2460,7 +2493,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
 /*
  * Enqueue the creation of a per-memcg kmem_cache.
  */
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 					       struct kmem_cache *cachep)
 {
 	struct memcg_kmem_cache_create_work *cw;
@@ -2478,25 +2511,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 	queue_work(memcg_kmem_cache_wq, &cw->work);
 }
 
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
-					     struct kmem_cache *cachep)
-{
-	/*
-	 * We need to stop accounting when we kmalloc, because if the
-	 * corresponding kmalloc cache is not yet created, the first allocation
-	 * in __memcg_schedule_kmem_cache_create will recurse.
-	 *
-	 * However, it is better to enclose the whole function. Depending on
-	 * the debugging options enabled, INIT_WORK(), for instance, can
-	 * trigger an allocation. This too, will make us recurse. Because at
-	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
-	 * the safest choice is to do it like this, wrapping the whole function.
-	 */
-	current->memcg_kmem_skip_account = 1;
-	__memcg_schedule_kmem_cache_create(memcg, cachep);
-	current->memcg_kmem_skip_account = 0;
-}
-
 static inline bool memcg_kmem_bypass(void)
 {
 	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2531,9 +2545,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	if (current->memcg_kmem_skip_account)
-		return cachep;
-
 	memcg = get_mem_cgroup_from_current();
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
@@ -2572,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
 }
 
 /**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
@@ -2580,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
 *
 * Returns 0 on success, an error code on failure.
 */
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 			    struct mem_cgroup *memcg)
 {
 	unsigned int nr_pages = 1 << order;
@@ -2603,14 +2614,14 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 }
 
 /**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
 	struct mem_cgroup *memcg;
 	int ret = 0;
@@ -2620,7 +2631,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 
 	memcg = get_mem_cgroup_from_current();
 	if (!mem_cgroup_is_root(memcg)) {
-		ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
 		if (!ret)
 			__SetPageKmemcg(page);
 	}
@@ -2628,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 	return ret;
 }
 
 /**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
 {
 	struct mem_cgroup *memcg = page->mem_cgroup;
 	unsigned int nr_pages = 1 << order;
@@ -2663,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
 
 /*
 * Because tail pages are not marked as "used", set it. We're under
- * zone_lru_lock and migration entries setup in all page mappings.
+ * pgdat->lru_lock and migration entries setup in all page mappings.
 */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
@@ -3336,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	const struct numa_stat *stat;
 	int nid;
 	unsigned long nr;
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
 		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3387,7 +3398,7 @@ static const char *const memcg1_event_names[] = {
 
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 	unsigned long memory, memsw;
 	struct mem_cgroup *mi;
 	unsigned int i;
@@ -3625,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
 
 	/* Allocate memory for new array of thresholds */
-	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
-		      GFP_KERNEL);
+	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
 	if (!new) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
@@ -3820,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
 
 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
 
 	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -4321,14 +4331,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
 
 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
-	atomic_add(n, &memcg->id.ref);
+	refcount_add(n, &memcg->id.ref);
 }
 
 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
-	if (atomic_sub_and_test(n, &memcg->id.ref)) {
+	if (refcount_sub_and_test(n, &memcg->id.ref)) {
 		mem_cgroup_id_remove(memcg);
 
 		/* Memcg ID pins CSS */
@@ -4421,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
-	size_t size;
+	unsigned int size;
 	int node;
 
 	size = sizeof(struct mem_cgroup);
@@ -4545,7 +4553,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	}
 
 	/* Online state pins memcg ID, memcg ID pins CSS */
-	atomic_set(&memcg->id.ref, 1);
+	refcount_set(&memcg->id.ref, 1);
 	css_get(css);
 	return 0;
 }
@@ -4573,6 +4581,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 
+	drain_all_stock(memcg);
+
 	mem_cgroup_id_put(memcg);
 }
 
@@ -4750,7 +4760,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 	/* shmem/tmpfs may report page out on swap: account for that too. */
 	if (shmem_mapping(mapping)) {
 		page = find_get_entry(mapping, pgoff);
-		if (radix_tree_exceptional_entry(page)) {
+		if (xa_is_value(page)) {
 			swp_entry_t swp = radix_to_swp_entry(page);
 			if (do_memsw_account())
 				*entry = swp;
@@ -5353,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 		root_mem_cgroup->use_hierarchy = false;
 }
 
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+	if (value == PAGE_COUNTER_MAX)
+		seq_puts(m, "max\n");
+	else
+		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+	return 0;
+}
+
 static u64 memory_current_read(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
@@ -5363,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
 
 static int memory_min_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long min = READ_ONCE(memcg->memory.min);
-
-	if (min == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+				      READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
 }
 
 static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5393,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
 
 static int memory_low_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long low = READ_ONCE(memcg->memory.low);
-
-	if (low == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+				      READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
 }
 
 static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5423,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
 
 static int memory_high_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long high = READ_ONCE(memcg->high);
-
-	if (high == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
 }
 
 static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5460,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
 static int memory_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->memory.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+				      READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
 }
 
 static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5522,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 static int memory_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
 	seq_printf(m, "low %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
@@ -5540,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 
 static int memory_stat_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 	struct accumulated_stats acc;
 	int i;
 
@@ -5581,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "file_writeback %llu\n",
 		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
 
+	/*
+	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
+	 * arse because it requires migrating the work out of rmap to a place
+	 * where the page->mem_cgroup is set up and stable.
+	 */
+	seq_printf(m, "anon_thp %llu\n",
+		   (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
 			   (u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5595,6 +5595,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
 	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
 
+	seq_printf(m, "workingset_refault %lu\n",
+		   acc.stat[WORKINGSET_REFAULT]);
+	seq_printf(m, "workingset_activate %lu\n",
+		   acc.stat[WORKINGSET_ACTIVATE]);
+	seq_printf(m, "workingset_nodereclaim %lu\n",
+		   acc.stat[WORKINGSET_NODERECLAIM]);
+
 	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
 	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
 		   acc.events[PGSCAN_DIRECT]);
@@ -5605,19 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
 	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
 
-	seq_printf(m, "workingset_refault %lu\n",
-		   acc.stat[WORKINGSET_REFAULT]);
-	seq_printf(m, "workingset_activate %lu\n",
-		   acc.stat[WORKINGSET_ACTIVATE]);
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   acc.stat[WORKINGSET_NODERECLAIM]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+	seq_printf(m, "thp_collapse_alloc %lu\n",
+		   acc.events[THP_COLLAPSE_ALLOC]);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 	return 0;
 }
 
 static int memory_oom_group_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
 	seq_printf(m, "%d\n", memcg->oom_group);
 
@@ -5746,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
 *
 *             | memory.current, if memory.current < memory.low
 * low_usage = |
-	       | 0, otherwise.
+ *	       | 0, otherwise.
 *
 *
 * Such definition of the effective memory.low provides the expected
@@ -6377,7 +6383,7 @@ subsys_initcall(mem_cgroup_init);
 #ifdef CONFIG_MEMCG_SWAP
 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 {
-	while (!atomic_inc_not_zero(&memcg->id.ref)) {
+	while (!refcount_inc_not_zero(&memcg->id.ref)) {
 		/*
 		 * The root cgroup cannot be destroyed, so it's refcount must
 		 * always be >= 1.
@@ -6600,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
 
 static int swap_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->swap.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+				      READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
 }
 
 static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6630,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
 
 static int swap_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
 	seq_printf(m, "max %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
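Two of the techniques in the patch above are worth spelling out. First, mem_cgroup_out_of_memory() and try_charge() now share a single should_force_charge() predicate, take oom_lock with mutex_lock_killable(), and re-test the predicate after the lock is acquired, because a task can become an OOM victim or receive a fatal signal while it sleeps on the lock. The snippet below is a minimal userspace sketch of that check/lock/re-check shape using pthreads; should_bail(), do_oom_kill() and the task_is_dying flag are hypothetical stand-ins, not kernel APIs, and the kernel's killable-wait semantics are not reproduced here.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the state tested by should_force_charge(). */
static atomic_bool task_is_dying;	/* ~ tsk_is_oom_victim() || fatal_signal_pending() */
static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

static bool should_bail(void)
{
	return atomic_load(&task_is_dying);
}

/* Stand-in for out_of_memory(): pick a victim and report progress. */
static bool do_oom_kill(void)
{
	atomic_store(&task_is_dying, true);
	puts("oom: victim selected");
	return true;
}

/*
 * Mirrors the shape of mem_cgroup_out_of_memory() after the patch: bail out
 * cheaply before sleeping on the lock, then check again once the lock is
 * held, because the condition may have become true while we were waiting.
 */
static bool charge_out_of_memory(void)
{
	bool ret;

	if (should_bail())
		return true;	/* caller force-charges so the task can exit */

	pthread_mutex_lock(&oom_lock);	/* the kernel uses mutex_lock_killable() */
	ret = should_bail() || do_oom_kill();
	pthread_mutex_unlock(&oom_lock);
	return ret;
}

int main(void)
{
	printf("first attempt made progress: %d\n", charge_out_of_memory());
	printf("dying task bails out early:  %d\n", charge_out_of_memory());
	return 0;
}

Second, __mem_cgroup_usage_register_event() replaces the open-coded sizeof(*new) + size * sizeof(entry) computation with the kernel's struct_size() helper from <linux/overflow.h>, which saturates instead of wrapping when the multiplication overflows. A rough userspace approximation of the same idea, assuming GCC/Clang's __builtin_*_overflow intrinsics (struct threshold_ary and struct_size_entries() are illustrative names, not the kernel's):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative struct with a flexible array member, like the thresholds array. */
struct threshold_ary {
	size_t size;
	unsigned long entries[];
};

/* Rough userspace stand-in for struct_size(new, entries, n): SIZE_MAX on overflow. */
static size_t struct_size_entries(size_t n)
{
	size_t bytes;

	if (__builtin_mul_overflow(n, sizeof(unsigned long), &bytes) ||
	    __builtin_add_overflow(bytes, sizeof(struct threshold_ary), &bytes))
		return SIZE_MAX;	/* allocation fails instead of being undersized */
	return bytes;
}

int main(void)
{
	size_t n = 4;
	struct threshold_ary *new = malloc(struct_size_entries(n));

	if (!new)
		return 1;
	new->size = n;
	memset(new->entries, 0, n * sizeof(new->entries[0]));
	printf("allocated %zu bytes for %zu entries\n", struct_size_entries(n), n);
	free(new);
	return 0;
}

Saturating to SIZE_MAX means a corrupted or attacker-controlled count produces a failed allocation rather than a short one, which is why the helper is preferred over open-coded size arithmetic.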