From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 6774e6b2e96d..51915842f1c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -1848,16 +1848,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1867,8 +1870,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } -- cgit From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. - RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 +-- fs/proc/task_mmu.c | 7 +++--- include/linux/mm.h | 13 +++------- include/linux/mm_types.h | 2 +- kernel/fork.c | 5 ++-- mm/debug.c | 4 +-- mm/mmap.c | 63 +++++++++++++++++++++++----------------------- mm/mprotect.c | 8 ++++-- mm/mremap.c | 7 +++--- 9 files changed, 54 insertions(+), 58 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 60e02f7747ff..9cd607b06964 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2332,8 +2332,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ insert_vm_struct(mm, vma); - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma_pages(vma)); up_write(&task->mm->mmap_sem); /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 46d9619d0aea..a353b4c6e86e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -23,7 +23,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -76,7 +75,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) anon << (PAGE_SHIFT-10), file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -97,7 +96,7 @@ unsigned long task_statm(struct mm_struct *mm, get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index ec9d4559514d..839d9e9a1c38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1929,7 +1929,9 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); -extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); +extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); +extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); + extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, @@ -2147,15 +2149,6 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); -#else -static inline void vm_stat_account(struct mm_struct *mm, - unsigned long flags, struct file *file, long pages) -{ - mm->total_vm += pages; -} -#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_PAGEALLOC extern bool _debug_pagealloc_enabled; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 207890be93c8..6bc9a0ce2253 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -427,7 +427,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long pinned_vm; /* Refcount permanently increased */ - unsigned long shared_vm; /* Shared pages (files) */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; diff --git a/kernel/fork.c b/kernel/fork.c index 51915842f1c0..2e391c754ae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; diff --git a/mm/debug.c b/mm/debug.c index 668aa35191ca..5d2072ed8d5e 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm) mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, diff --git a/mm/mmap.c b/mm/mmap.c index f32b84ad621a..b3f00b616b81 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1220,24 +1220,6 @@ none: return NULL; } -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) -{ - const unsigned long stack_flags - = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); - - mm->total_vm += pages; - - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; - } else if (flags & stack_flags) - mm->stack_vm += pages; -} -#endif /* CONFIG_PROC_FS */ - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1556,7 +1538,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* @@ -1565,7 +1547,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ nr_pages = count_vma_pages_range(mm, addr, addr + len); - if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } @@ -1664,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) @@ -2111,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long new_start, actual_size; /* address space limit tests */ - if (!may_expand_vm(mm, grow)) + if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -2208,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2284,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; @@ -2399,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vm_stat_account(mm, vma->vm_flags, -nrpages); vma = remove_vma(vma); } while (vma); vm_unacct_memory(nr_accounted); @@ -2769,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2804,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; @@ -2995,9 +2977,28 @@ out: * Return true if the calling process may expand its vm space by the passed * number of pages */ -int may_expand_vm(struct mm_struct *mm, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { - return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT; + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & + (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) + return mm->data_vm + npages <= rlimit(RLIMIT_DATA); + + return true; +} + +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + mm->total_vm += npages; + + if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) + mm->exec_vm += npages; + else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) + mm->stack_vm += npages; + else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + mm->data_vm += npages; } static int special_mapping_fault(struct vm_area_struct *vma, @@ -3079,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping( if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..c764402c464f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. */ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; @@ -334,8 +338,8 @@ success: populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index de824e72c3e8..e55b157865d5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); /* Tell pfnmap has moved from this vma */ if (unlikely(vma->vm_flags & VM_PFNMAP)) @@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EAGAIN); } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { @@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; locked = true; -- cgit