diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 659 |
1 files changed, 424 insertions, 235 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca0bc6e6be13..9ec5e12486a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include <linux/lockdep.h> #include <linux/file.h> #include <linux/tracehook.h> +#include <linux/seq_buf.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = READ_ONCE(page->mem_cgroup); + if (PageHead(page) && PageSlab(page)) + memcg = memcg_from_slab_page(page); + else + memcg = READ_ONCE(page->mem_cgroup); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); if (memcg) @@ -695,7 +699,11 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup *mi; - atomic_long_add(x, &memcg->vmstats_local[idx]); + /* + * Batch local counters to keep them in sync with + * the hierarchical ones. + */ + __this_cpu_add(memcg->vmstats_local->stat[idx], x); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &mi->vmstats[idx]); x = 0; @@ -745,11 +753,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ + __this_cpu_add(pn->lruvec_stat_local->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup_per_node *pi; - atomic_long_add(x, &pn->lruvec_stat_local[idx]); for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; @@ -757,6 +766,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +{ + struct page *page = virt_to_head_page(p); + pg_data_t *pgdat = page_pgdat(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = memcg_from_slab_page(page); + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg || memcg == root_mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(pgdat, memcg); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -775,7 +804,11 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (unlikely(x > MEMCG_CHARGE_BATCH)) { struct mem_cgroup *mi; - atomic_long_add(x, &memcg->vmevents_local[idx]); + /* + * Batch local counters to keep them in sync with + * the hierarchical ones. + */ + __this_cpu_add(memcg->vmstats_local->events[idx], x); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &mi->vmevents[idx]); x = 0; @@ -790,7 +823,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event) static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { - return atomic_long_read(&memcg->vmevents_local[event]); + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_local->events[event], cpu); + return x; } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -1110,26 +1148,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +static void __invalidate_reclaim_iterators(struct mem_cgroup *from, + struct mem_cgroup *dead_memcg) { - struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; int i; - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + for_each_node(nid) { + mz = mem_cgroup_nodeinfo(from, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup *last; + + do { + __invalidate_reclaim_iterators(memcg, dead_memcg); + last = memcg; + } while ((memcg = parent_mem_cgroup(memcg))); + + /* + * When cgruop1 non-hierarchy mode is used, + * parent_mem_cgroup() does not walk all the way up to the + * cgroup root (root_mem_cgroup). So we have to handle + * dead_memcg from cgroup root separately. + */ + if (last != root_mem_cgroup) + __invalidate_reclaim_iterators(root_mem_cgroup, + dead_memcg); +} + /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root @@ -1155,7 +1212,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, 0, &it); + css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); @@ -1247,32 +1304,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; } -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -{ - struct mem_cgroup *task_memcg; - struct task_struct *p; - bool ret; - - p = find_lock_task_mm(task); - if (p) { - task_memcg = get_mem_cgroup_from_mm(p->mm); - task_unlock(p); - } else { - /* - * All threads may have already detached their mm's, but the oom - * killer still needs to detect if they have already been oom - * killed to prevent needlessly killing additional tasks. - */ - rcu_read_lock(); - task_memcg = mem_cgroup_from_task(task); - css_get(&task_memcg->css); - rcu_read_unlock(); - } - ret = mem_cgroup_is_descendant(task_memcg, memcg); - css_put(&task_memcg->css); - return ret; -} - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -1348,27 +1379,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, - NR_SHMEM, - NR_FILE_MAPPED, - NR_FILE_DIRTY, - NR_WRITEBACK, - MEMCG_SWAP, -}; +static char *memory_stat_format(struct mem_cgroup *memcg) +{ + struct seq_buf s; + int i; -static const char *const memcg1_stat_names[] = { - "cache", - "rss", - "rss_huge", - "shmem", - "mapped_file", - "dirty", - "writeback", - "swap", -}; + seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); + if (!s.buffer) + return NULL; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_buf_printf(&s, "anon %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS) * + PAGE_SIZE); + seq_buf_printf(&s, "file %llu\n", + (u64)memcg_page_state(memcg, MEMCG_CACHE) * + PAGE_SIZE); + seq_buf_printf(&s, "kernel_stack %llu\n", + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + 1024); + seq_buf_printf(&s, "slab %llu\n", + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); + seq_buf_printf(&s, "sock %llu\n", + (u64)memcg_page_state(memcg, MEMCG_SOCK) * + PAGE_SIZE); + + seq_buf_printf(&s, "shmem %llu\n", + (u64)memcg_page_state(memcg, NR_SHMEM) * + PAGE_SIZE); + seq_buf_printf(&s, "file_mapped %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * + PAGE_SIZE); + seq_buf_printf(&s, "file_dirty %llu\n", + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * + PAGE_SIZE); + seq_buf_printf(&s, "file_writeback %llu\n", + (u64)memcg_page_state(memcg, NR_WRITEBACK) * + PAGE_SIZE); + + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_buf_printf(&s, "anon_thp %llu\n", + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); + + seq_buf_printf(&s, "slab_reclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); + seq_buf_printf(&s, "slab_unreclaimable %llu\n", + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE); + + /* Accumulated memory events */ + + seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); + + seq_buf_printf(&s, "workingset_refault %lu\n", + memcg_page_state(memcg, WORKINGSET_REFAULT)); + seq_buf_printf(&s, "workingset_activate %lu\n", + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_nodereclaim %lu\n", + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_buf_printf(&s, "pgscan %lu\n", + memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_buf_printf(&s, "pgsteal %lu\n", + memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_buf_printf(&s, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); + seq_buf_printf(&s, "thp_collapse_alloc %lu\n", + memcg_events(memcg, THP_COLLAPSE_ALLOC)); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* The above should easily fit into one page */ + WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + + return s.buffer; +} #define K(x) ((x) << (PAGE_SHIFT-10)) /** @@ -1403,39 +1521,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct * */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { - struct mem_cgroup *iter; - unsigned int i; + char *buf; pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), K((u64)memcg->memory.max), memcg->memory.failcnt); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.max), memcg->memsw.failcnt); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", - K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.max), memcg->kmem.failcnt); - - for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats for "); - pr_cont_cgroup_path(iter->css.cgroup); - pr_cont(":"); - - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) - continue; - pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state_local(iter, - memcg1_stats[i]))); - } - - for (i = 0; i < NR_LRU_LISTS; i++) - pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(memcg_page_state_local(iter, - NR_LRU_BASE + i))); - - pr_cont("\n"); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->swap)), + K((u64)memcg->swap.max), memcg->swap.failcnt); + else { + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.max), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.max), memcg->kmem.failcnt); } + + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(":"); + buf = memory_stat_format(memcg); + if (!buf) + return; + pr_info("%s", buf); + kfree(buf); } /* @@ -2191,11 +2302,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) long x; x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); - if (x) { - atomic_long_add(x, &memcg->vmstats_local[i]); + if (x) for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &memcg->vmstats[i]); - } if (i >= NR_VM_NODE_STAT_ITEMS) continue; @@ -2205,12 +2314,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) pn = mem_cgroup_nodeinfo(memcg, nid); x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) { - atomic_long_add(x, &pn->lruvec_stat_local[i]); + if (x) do { atomic_long_add(x, &pn->lruvec_stat[i]); } while ((pn = parent_nodeinfo(pn, nid))); - } } } @@ -2218,11 +2325,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) long x; x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); - if (x) { - atomic_long_add(x, &memcg->vmevents_local[i]); + if (x) for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &memcg->vmevents[i]); - } } } @@ -2277,7 +2382,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - bool oomed = false; enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) @@ -2364,7 +2468,7 @@ retry: if (nr_retries--) goto retry; - if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; if (gfp_mask & __GFP_NOFAIL) @@ -2383,7 +2487,6 @@ retry: switch (oom_status) { case OOM_SUCCESS: nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - oomed = true; goto retry; case OOM_FAILED: goto force; @@ -2586,12 +2689,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, { struct memcg_kmem_cache_create_work *cw; + if (!css_tryget_online(&memcg->css)) + return; + cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); if (!cw) return; - css_get(&memcg->css); - cw->memcg = memcg; cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); @@ -2626,6 +2730,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + struct memcg_cache_array *arr; int kmemcg_id; VM_BUG_ON(!is_root_cache(cachep)); @@ -2633,14 +2738,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - memcg = get_mem_cgroup_from_current(); + rcu_read_lock(); + + if (unlikely(current->active_memcg)) + memcg = current->active_memcg; + else + memcg = mem_cgroup_from_task(current); + + if (!memcg || memcg == root_mem_cgroup) + goto out_unlock; + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) - goto out; + goto out_unlock; - memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); - if (likely(memcg_cachep)) - return memcg_cachep; + arr = rcu_dereference(cachep->memcg_params.memcg_caches); + + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match the data dependency + * barrier inside READ_ONCE() (see memcg_create_kmem_cache()). + */ + memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]); /* * If we are in a safe context (can wait, and not in interrupt @@ -2653,10 +2772,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) * memcg_create_kmem_cache, this means no further allocation * could happen with the slab_mutex held. So it's better to * defer everything. + * + * If the memcg is dying or memcg_cache is about to be released, + * don't bother creating new kmem_caches. Because memcg_cachep + * is ZEROed as the fist step of kmem offlining, we don't need + * percpu_ref_tryget_live() here. css_tryget_online() check in + * memcg_schedule_kmem_cache_create() will prevent us from + * creation of a new kmem_cache. */ - memcg_schedule_kmem_cache_create(memcg, cachep); -out: - css_put(&memcg->css); + if (unlikely(!memcg_cachep)) + memcg_schedule_kmem_cache_create(memcg, cachep); + else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) + cachep = memcg_cachep; +out_unlock: + rcu_read_unlock(); return cachep; } @@ -2667,7 +2796,7 @@ out: void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params.memcg->css); + percpu_ref_put(&cachep->memcg_params.refcnt); } /** @@ -2695,9 +2824,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, cancel_charge(memcg, nr_pages); return -ENOMEM; } - - page->mem_cgroup = memcg; - return 0; } @@ -2720,12 +2846,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); - if (!ret) + if (!ret) { + page->mem_cgroup = memcg; __SetPageKmemcg(page); + } } css_put(&memcg->css); return ret; } + +/** + * __memcg_kmem_uncharge_memcg: uncharge a kmem page + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge + */ +void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} /** * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge @@ -2740,14 +2884,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - + __memcg_kmem_uncharge_memcg(memcg, nr_pages); page->mem_cgroup = NULL; /* slab pages do not have PageKmemcg flag set */ @@ -3121,6 +3258,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) +{ + unsigned long stat[MEMCG_NR_STAT]; + struct mem_cgroup *mi; + int node, cpu, i; + int min_idx, max_idx; + + if (slab_only) { + min_idx = NR_SLAB_RECLAIMABLE; + max_idx = NR_SLAB_UNRECLAIMABLE; + } else { + min_idx = 0; + max_idx = MEMCG_NR_STAT; + } + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &mi->vmstats[i]); + + if (!slab_only) + max_idx = NR_VM_NODE_STAT_ITEMS; + + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + struct mem_cgroup_per_node *pi; + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu( + pn->lruvec_stat_cpu->count[i], cpu); + + for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &pi->lruvec_stat[i]); + } +} + +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct mem_cgroup *mi; + int cpu, i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] += per_cpu(memcg->vmstats_percpu->events[i], + cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + atomic_long_add(events[i], &mi->vmevents[i]); +} + #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3166,16 +3369,23 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) */ memcg->kmem_state = KMEM_ALLOCATED; - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; /* + * Deactivate and reparent kmem_caches. Then flush percpu + * slab statistics to have precise values at the parent and + * all ancestor levels. It's required to keep slab stats + * accurate after the reparenting of kmem_caches. + */ + memcg_deactivate_kmem_caches(memcg, parent); + memcg_flush_percpu_vmstats(memcg, true); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus * to ones of the parent. After we have finished, all list_lrus @@ -3205,9 +3415,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) memcg_offline_kmem(memcg); if (memcg->kmem_state == KMEM_ALLOCATED) { - memcg_destroy_kmem_caches(memcg); + WARN_ON(!list_empty(&memcg->kmem_caches)); static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); } } #else @@ -3470,6 +3679,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +static const unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + /* Universal VM events cgroup1 shows, original sort order */ static const unsigned int memcg1_events[] = { PGPGIN, @@ -3528,12 +3759,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)memcg_page_state(memcg, i) * PAGE_SIZE); + (u64)memcg_page_state(memcg, memcg1_stats[i]) * + PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)memcg_events(memcg, i)); + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], @@ -4483,8 +4715,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat_local) { + kfree(pn); + return 1; + } + pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); if (!pn->lruvec_stat_cpu) { + free_percpu(pn->lruvec_stat_local); kfree(pn); return 1; } @@ -4506,6 +4745,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return; free_percpu(pn->lruvec_stat_cpu); + free_percpu(pn->lruvec_stat_local); kfree(pn); } @@ -4513,9 +4753,16 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + /* + * Flush percpu vmstats and vmevents to guarantee the value correctness + * on parent's and all ancestor levels. + */ + memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); + free_percpu(memcg->vmstats_local); kfree(memcg); } @@ -4544,6 +4791,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; + memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); + if (!memcg->vmstats_local) + goto fail; + memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); if (!memcg->vmstats_percpu) goto fail; @@ -4619,6 +4870,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* The following stuff does not apply to the root */ if (!parent) { +#ifdef CONFIG_MEMCG_KMEM + INIT_LIST_HEAD(&memcg->kmem_caches); +#endif root_mem_cgroup = memcg; return &memcg->css; } @@ -4778,7 +5032,7 @@ enum mc_target_type { static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = _vm_normal_page(vma, addr, ptent, true); + struct page *page = vm_normal_page(vma, addr, ptent); if (!page || !page_mapped(page)) return NULL; @@ -4979,8 +5233,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC - * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE + * (so ZONE_DEVICE page and thus not on the lru). * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5014,8 +5268,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_public_page(page)) + if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; @@ -5086,8 +5339,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (ptl) { /* * Note their can not be MC_TARGET_DEVICE for now as we do not - * support transparent huge page with MEMORY_DEVICE_PUBLIC or - * MEMORY_DEVICE_PRIVATE but this might change. + * support transparent huge page with MEMORY_DEVICE_PRIVATE but + * this might change. */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; @@ -5610,112 +5863,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static void __memory_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); + seq_printf(m, "oom_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_KILL])); +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - seq_printf(m, "low %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_LOW])); - seq_printf(m, "high %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); - seq_printf(m, "max %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_MAX])); - seq_printf(m, "oom %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM])); - seq_printf(m, "oom_kill %lu\n", - atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); - + __memory_events_show(m, memcg->memory_events); return 0; } -static int memory_stat_show(struct seq_file *m, void *v) +static int memory_events_local_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - int i; - /* - * Provide statistics on the state of the memory subsystem as - * well as cumulative event counters that show past behavior. - * - * This list is ordered following a combination of these gradients: - * 1) generic big picture -> specifics and details - * 2) reflecting userspace activity -> reflecting kernel heuristics - * - * Current memory state: - */ - - seq_printf(m, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); - seq_printf(m, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); - seq_printf(m, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); - seq_printf(m, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * - PAGE_SIZE); - seq_printf(m, "sock %llu\n", - (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); - - seq_printf(m, "shmem %llu\n", - (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); - seq_printf(m, "file_mapped %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); - seq_printf(m, "file_dirty %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); - seq_printf(m, "file_writeback %llu\n", - (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. - */ - seq_printf(m, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE); - - for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); - - seq_printf(m, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * - PAGE_SIZE); - seq_printf(m, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * - PAGE_SIZE); - - /* Accumulated memory events */ - - seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); - seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); - - seq_printf(m, "workingset_refault %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT)); - seq_printf(m, "workingset_activate %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE)); - seq_printf(m, "workingset_nodereclaim %lu\n", - memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); - - seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); - seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); - seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); - seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); - seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); - seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); - seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); + __memory_events_show(m, memcg->memory_events_local); + return 0; +} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", - memcg_events(memcg, THP_FAULT_ALLOC)); - seq_printf(m, "thp_collapse_alloc %lu\n", - memcg_events(memcg, THP_COLLAPSE_ALLOC)); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + char *buf; + buf = memory_stat_format(memcg); + if (!buf) + return -ENOMEM; + seq_puts(m, buf); + kfree(buf); return 0; } @@ -5787,6 +5970,12 @@ static struct cftype memory_files[] = { .seq_show = memory_events_show, }, { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memory_events_local_show, + }, + { .name = "stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, |