From 844f35db1088dd1a9de37b53d4d823626232bd19 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:57 -0700 Subject: dax: add huge page fault support This is the support code for DAX-enabled filesystems to allow them to provide huge pages in response to faults. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 7af2851d667c..7bde64014a89 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -60,9 +60,10 @@ Filesystem support consists of - implementing the direct_IO address space operation, and calling dax_do_io() instead of blockdev_direct_IO() if S_DAX is set - implementing an mmap file operation for DAX files which sets the - VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers - for fault and page_mkwrite (which should probably call dax_fault() and - dax_mkwrite(), passing the appropriate get_block() callback) + VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to + include handlers for fault, pmd_fault and page_mkwrite (which should + probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the + appropriate get_block() callback) - calling dax_truncate_page() instead of block_truncate_page() for DAX files - calling dax_zero_page_range() instead of zero_user() for DAX files - ensuring that there is sufficient locking between reads, writes, -- cgit v1.2.3-73-gaa49b From 77bb499bb60f4b79cca7d139c8041662860fcf87 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 8 Sep 2015 15:00:10 -0700 Subject: pagemap: add mmap-exclusive bit for marking pages mapped only here This patch sets bit 56 in pagemap if this page is mapped only once. It allows to detect exclusively used pages without exposing PFN: present file exclusive state 0 0 0 non-present 1 1 0 file page mapped somewhere else 1 1 1 file page mapped only here 1 0 0 anon non-CoWed page (shared with parent/child) 1 0 1 anon CoWed page (or never forked) CoWed pages in (MAP_FILE | MAP_PRIVATE) areas are anon in this context. MMap-exclusive bit doesn't reflect potential page-sharing via swapcache: page could be mapped once but has several swap-ptes which point to it. Application could detect that by swap bit in pagemap entry and touch that pte via /proc/pid/mem to get real information. See http://lkml.kernel.org/r/CAEVpBa+_RyACkhODZrRvQLs80iy0sqpdrd0AaP_-tgnX3Y9yNQ@mail.gmail.com Requested by Mark Williamson. 
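As a rough illustration, a minimal stand-alone userspace sketch (hypothetical, not part of the patch; bit positions as listed above, error handling mostly omitted) could test the new bit for a given virtual address:

/* Sketch: report whether the page backing vaddr is mapped only once. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PM_MMAP_EXCLUSIVE	(1ULL << 56)	/* new in this patch */
#define PM_FILE			(1ULL << 61)
#define PM_SWAP			(1ULL << 62)
#define PM_PRESENT		(1ULL << 63)

int main(int argc, char **argv)
{
	unsigned long vaddr;
	long pagesize = sysconf(_SC_PAGESIZE);
	uint64_t ent;
	int fd;

	if (argc < 2)
		return 1;
	vaddr = strtoul(argv[1], NULL, 0);

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	if (pread(fd, &ent, sizeof(ent),
		  (off_t)(vaddr / pagesize) * sizeof(ent)) != sizeof(ent))
		return 1;
	printf("present=%d file=%d swap=%d exclusive=%d\n",
	       !!(ent & PM_PRESENT), !!(ent & PM_FILE),
	       !!(ent & PM_SWAP), !!(ent & PM_MMAP_EXCLUSIVE));
	close(fd);
	return 0;
}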
[akpm@linux-foundation.org: fix spello] Signed-off-by: Konstantin Khlebnikov Reviewed-by: Mark Williamson Tested-by: Mark Williamson Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/pagemap.txt | 3 ++- fs/proc/task_mmu.c | 14 +++++++++++++- tools/vm/page-types.c | 10 ++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 6bfbc172cdb9..56faec0f73f7 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -16,7 +16,8 @@ There are three components to pagemap: * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) - * Bits 56-60 zero + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bc651644b1b2..67c76468a7be 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -949,6 +949,7 @@ struct pagemapread { #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1036,6 +1037,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1066,6 +1069,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, * This if-check is just to prepare for future implementation. 
*/ if (pmd_present(pmd)) { + struct page *page = pmd_page(pmd); + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pmd_pfn(pmd) + @@ -1131,6 +1139,9 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (!PageAnon(page)) flags |= PM_FILE; + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + @@ -1163,7 +1174,8 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) - * Bits 56-60 zero + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 603ec916716b..7f73fa32a590 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -62,6 +62,7 @@ #define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) #define PM_SOFT_DIRTY (1ULL << 55) +#define PM_MMAP_EXCLUSIVE (1ULL << 56) #define PM_FILE (1ULL << 61) #define PM_SWAP (1ULL << 62) #define PM_PRESENT (1ULL << 63) @@ -91,6 +92,8 @@ #define KPF_SLOB_FREE 49 #define KPF_SLUB_FROZEN 50 #define KPF_SLUB_DEBUG 51 +#define KPF_FILE 62 +#define KPF_MMAP_EXCLUSIVE 63 #define KPF_ALL_BITS ((uint64_t)~0ULL) #define KPF_HACKERS_BITS (0xffffULL << 32) @@ -140,6 +143,9 @@ static const char * const page_flag_names[] = { [KPF_SLOB_FREE] = "P:slob_free", [KPF_SLUB_FROZEN] = "A:slub_frozen", [KPF_SLUB_DEBUG] = "E:slub_debug", + + [KPF_FILE] = "F:file", + [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", }; @@ -443,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) if (pme & PM_SOFT_DIRTY) flags |= BIT(SOFTDIRTY); + if (pme & PM_FILE) + flags |= BIT(FILE); + if (pme & PM_MMAP_EXCLUSIVE) + flags |= BIT(MMAP_EXCLUSIVE); return flags; } -- cgit v1.2.3-73-gaa49b From 83b4b0bb635eee2b8e075062e4e008d1bc110ed7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 8 Sep 2015 15:00:13 -0700 Subject: pagemap: update documentation Notes about recent changes. [akpm@linux-foundation.org: various tweaks] Signed-off-by: Konstantin Khlebnikov Cc: Mark Williamson Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/pagemap.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 56faec0f73f7..3cd38438242a 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -16,12 +16,17 @@ There are three components to pagemap: * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) - * Bit 56 page exclusively mapped + * Bit 56 page exclusively mapped (since 4.2) * Bits 57-60 zero - * Bit 61 page is file-page or shared-anon + * Bit 61 page is file-page or shared-anon (since 3.5) * Bit 62 page swapped * Bit 63 page present + Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs. + In 4.0 and 4.1 opens by unprivileged fail with -EPERM. Starting from + 4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN. + Reason: information about PFNs helps in exploiting Rowhammer vulnerability. 
+ If the page is not present but in swap, then the PFN contains an encoding of the swap file number and the page's offset into the swap. Unmapped pages return a null PFN. This allows determining @@ -160,3 +165,8 @@ Other notes: Reading from any of the files will return -EINVAL if you are not starting the read on an 8-byte boundary (e.g., if you sought an odd number of bytes into the file), or if the size of the read is not a multiple of 8 bytes. + +Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is +always 12 at most architectures). Since Linux 3.11 their meaning changes +after first clear of soft-dirty bits. Since Linux 4.2 they are used for +flags unconditionally. -- cgit v1.2.3-73-gaa49b From 8334b96221ff0dcbde4873d31eb4d84774ed8ed4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 8 Sep 2015 15:00:24 -0700 Subject: mm: /proc/pid/smaps:: show proportional swap share of the mapping We want to know per-process workingset size for smart memory management on userland and we use swap(ex, zram) heavily to maximize memory efficiency so workingset includes swap as well as RSS. On such system, if there are lots of shared anonymous pages, it's really hard to figure out exactly how many each process consumes memory(ie, rss + wap) if the system has lots of shared anonymous memory(e.g, android). This patch introduces SwapPss field on /proc//smaps so we can get more exact workingset size per process. Bongkyu tested it. Result is below. 1. 50M used swap SwapTotal: 461976 kB SwapFree: 411192 kB $ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}'; 48236 $ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}'; 141184 2. 240M used swap SwapTotal: 461976 kB SwapFree: 216808 kB $ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}'; 230315 $ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}'; 1387744 [akpm@linux-foundation.org: simplify kunmap_atomic() call] Signed-off-by: Minchan Kim Reported-by: Bongkyu Kim Tested-by: Bongkyu Kim Cc: Hugh Dickins Cc: Sergey Senozhatsky Cc: Jonathan Corbet Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 +++++++++++----- fs/proc/task_mmu.c | 18 ++++++++++++++-- include/linux/swap.h | 6 ++++++ mm/swapfile.c | 42 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 6f7fafde0884..d411ca63c8b6 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -424,6 +424,7 @@ Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB Swap: 0 kB +SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Locked: 374 kB @@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping (size), the amount of the mapping that is currently resident in RAM (RSS), the process' proportional share of this mapping (PSS), the number of clean and -dirty private pages in the mapping. Note that even a page which is part of a -MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used -by only one process, is accounted as private and not as shared. "Referenced" -indicates the amount of memory currently marked as referenced or accessed. +dirty private pages in the mapping. 
+ +The "proportional set size" (PSS) of a process is the count of pages it has +in memory, where each page is divided by the number of processes sharing it. +So if a process has 1000 pages all to itself, and 1000 shared with one other +process, its PSS will be 1500. +Note that even a page which is part of a MAP_SHARED mapping, but has only +a single pte mapped, i.e. is currently used by only one process, is accounted +as private and not as shared. +"Referenced" indicates the amount of memory currently marked as referenced or +accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "Swap" shows how much would-be-anonymous memory is also used, but out on swap. - +"SwapPss" shows proportional swap share of this mapping. "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded manner. The codes are the following: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 67c76468a7be..41f1a50c10c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -446,6 +446,7 @@ struct mem_size_stats { unsigned long anonymous_thp; unsigned long swap; u64 pss; + u64 swap_pss; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (!non_swap_entry(swpent)) + if (!non_swap_entry(swpent)) { + int mapcount; + mss->swap += PAGE_SIZE; - else if (is_migration_entry(swpent)) + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); } @@ -640,6 +652,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" "Locked: %8lu kB\n", @@ -654,6 +667,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.anonymous >> 10, mss.anonymous_thp >> 10, mss.swap >> 10, + (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, (vma->vm_flags & VM_LOCKED) ? 
diff --git a/include/linux/swap.h b/include/linux/swap.h index 31496d201fdc..6282f1eb3d6a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -431,6 +431,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); @@ -522,6 +523,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int swp_swapcount(swp_entry_t entry) +{ + return 0; +} + #define reuse_swap_page(page) (page_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) diff --git a/mm/swapfile.c b/mm/swapfile.c index aebc2dd6e649..58877312cf6b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -874,6 +874,48 @@ int page_swapcount(struct page *page) return count; } +/* + * How many references to @entry are currently swapped out? + * This considers COUNT_CONTINUED so it returns exact answer. + */ +int swp_swapcount(swp_entry_t entry) +{ + int count, tmp_count, n; + struct swap_info_struct *p; + struct page *page; + pgoff_t offset; + unsigned char *map; + + p = swap_info_get(entry); + if (!p) + return 0; + + count = swap_count(p->swap_map[swp_offset(entry)]); + if (!(count & COUNT_CONTINUED)) + goto out; + + count &= ~COUNT_CONTINUED; + n = SWAP_MAP_MAX + 1; + + offset = swp_offset(entry); + page = vmalloc_to_page(p->swap_map + offset); + offset &= ~PAGE_MASK; + VM_BUG_ON(page_private(page) != SWP_CONTINUED); + + do { + page = list_entry(page->lru.next, struct page, lru); + map = kmap_atomic(page); + tmp_count = map[offset]; + kunmap_atomic(map); + + count += (tmp_count & ~COUNT_CONTINUED) * n; + n *= (SWAP_CONT_MAX + 1); + } while (tmp_count & COUNT_CONTINUED); +out: + spin_unlock(&p->lock); + return count; +} + /* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content -- cgit v1.2.3-73-gaa49b From 071a4befebb655d6b31bf5c6bacd5a6df035224d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 Sep 2015 15:00:42 -0700 Subject: mm, oom: do not panic for oom kills triggered from sysrq Sysrq+f is used to kill a process either for debug or when the VM is otherwise unresponsive. It is not intended to trigger a panic when no process may be killed. Avoid panicking the system for sysrq+f when no processes are killed. Signed-off-by: David Rientjes Suggested-by: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 3 ++- mm/oom_kill.c | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 267f39386f99..13f5619b2203 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'e' - Send a SIGTERM to all processes, except for init. -'f' - Will call oom_kill to kill a memory hog process. +'f' - Will call the oom killer to kill a memory hog process, but do not + panic if nothing can be killed. 
'g' - Used by kgdb (kernel debugger) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 77adc8e876aa..91dd59f63910 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, if (constraint != CONSTRAINT_NONE) return; } + /* Do not panic for oom kills triggered by sysrq */ + if (oc->order == -1) + return; dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); @@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { + if (!p && oc->order != -1) { dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } - if (p != (void *)-1UL) { + if (p && p != (void *)-1UL) { oom_kill_process(oc, p, points, totalpages, NULL, "Out of memory"); killed = 1; -- cgit v1.2.3-73-gaa49b From ad82362b2defd4adad87d8538617b2f51a4bf9c3 Mon Sep 17 00:00:00 2001 From: "Sean O. Stalley" Date: Tue, 8 Sep 2015 15:02:27 -0700 Subject: mm: add dma_pool_zalloc() call to DMA API Add a wrapper function for dma_pool_alloc() to get zeroed memory. Signed-off-by: Sean O. Stalley Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Gilles Muller Cc: Nicolas Palix Cc: Michal Marek Cc: Sebastian Andrzej Siewior Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DMA-API.txt | 7 +++++++ include/linux/dmapool.h | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'Documentation') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 7eba542eff7c..edccacd4f048 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated from this pool must not cross 4KByte boundaries. + void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, + dma_addr_t *handle) + +Wraps dma_pool_alloc() and also zeroes the returned memory if the +allocation attempt succeeded. + + void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, dma_addr_t *dma_handle); diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index e1043f79122f..53ba737505df 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool); void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle); +static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, + dma_addr_t *handle) +{ + return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle); +} + void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr); /* -- cgit v1.2.3-73-gaa49b From e6590740ceb83fd014fae7d571fe5a5d5886b7c8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:02:58 -0700 Subject: Documentation: update libhugetlbfs location and use for testing The URL for libhugetlbfs has changed. Also, put a stronger emphasis on using libgugetlbfs for hugetlb regression testing. 
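For context, the smallest kind of check such test suites exercise looks roughly like the sketch below (an illustrative example only, assuming a pre-populated hugetlb pool and a 2 MB default huge page size):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LENGTH (2UL * 1024 * 1024)	/* one 2 MB huge page on x86 */

int main(void)
{
	void *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(addr, 0, LENGTH);	/* fault in the huge page */
	munmap(addr, LENGTH);
	return 0;
}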
Signed-off-by: Mike Kravetz Acked-by: Naoya Horiguchi Cc: Joern Engel Cc: Davidlohr Bueso Cc: David Rientjes Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 030977fb8d2d..54dd9b9c6c31 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -329,7 +329,14 @@ Examples 3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c -4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a - wide range of userspace tools to help with huge page usability, environment - setup, and control. Furthermore it provides useful test cases that should be - used when modifying code to ensure no regressions are introduced. +4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library + provides a wide range of userspace tools to help with huge page usability, + environment setup, and control. + +Kernel development regression testing +===================================== + +The most complete set of hugetlb tests are in the libhugetlbfs repository. +If you modify any hugetlb related code, use the libhugetlbfs test suite +to check for regressions. In addition, if you add any new hugetlb +functionality, please add appropriate tests to libhugetlbfs. -- cgit v1.2.3-73-gaa49b From 013110a73dcf970cb28c5b0a79f9eee577ea6aa2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 8 Sep 2015 15:04:10 -0700 Subject: mm/page_alloc.c: fix a misleading comment The comment says that the per-cpu batchsize and zone watermarks are determined by present_pages which is definitely wrong, they are both calculated from managed_pages. Fix it. Signed-off-by: Yaowei Bai Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 4 ++-- mm/page_alloc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9c3f2f8054b5..a4482fceacec 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -349,7 +349,7 @@ zone[i]'s protection[j] is calculated by following expression. (i < j): zone[i]->protection[j] - = (total sums of present_pages from zone[i+1] to zone[j] on the node) + = (total sums of managed_pages from zone[i+1] to zone[j] on the node) / lowmem_reserve_ratio[i]; (i = j): (should not be protected. = 0; @@ -360,7 +360,7 @@ The default values of lowmem_reserve_ratio[i] are 256 (if zone[i] means DMA or DMA32 zone) 32 (others). As above expression, they are reciprocal number of ratio. -256 means 1/256. # of protection pages becomes about "0.39%" of total present +256 means 1/256. # of protection pages becomes about "0.39%" of total managed pages of higher zones on the node. If you would like to protect more pages, smaller values are effective. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdaa0cf8fd41..59abb47b70ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6022,7 +6022,7 @@ void __init mem_init_print_info(const char *str) * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * - * The per-cpu batchsize and zone watermarks are determined by present_pages. 
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This * function may optionally be used to account for unfreeable pages in the -- cgit v1.2.3-73-gaa49b From 860c707dca155a56dfa115ddd6c00959296144a6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 8 Sep 2015 15:04:38 -0700 Subject: zsmalloc: account the number of compacted pages Compaction returns back to zram the number of migrated objects, which is quite uninformative -- we have objects of different sizes so user space cannot obtain any valuable data from that number. Change compaction to operate in terms of pages and return back to compaction issuer the number of pages that were freed during compaction. So from now on we will export more meaningful value in zram/mm_stat -- the number of freed (compacted) pages. This requires: (a) a rename of `num_migrated' to 'pages_compacted' (b) a internal API change -- return first_page's fullness_group from putback_zspage(), so we know when putback_zspage() did free_zspage(). It helps us to account compaction stats correctly. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 3 ++- drivers/block/zram/zram_drv.c | 2 +- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 27 +++++++++++++++++---------- 4 files changed, 22 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index c4de576093af..62435bb25266 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -144,7 +144,8 @@ mem_used_max RW the maximum amount memory zram have consumed to store compressed data mem_limit RW the maximum amount of memory ZRAM can use to store the compressed data -num_migrated RO the number of objects migrated migrated by compaction +pages_compacted RO the number of pages freed during compaction + (available only via zram/mm_stat node) compact WO trigger memory compaction WARNING diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bcde5c321090..f1c4bb34e007 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -450,7 +450,7 @@ static ssize_t mm_stat_show(struct device *dev, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.zero_pages), - pool_stats.num_migrated); + pool_stats.pages_compacted); up_read(&zram->init_lock); return ret; diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index ad3d23239043..6398dfae53f1 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -35,8 +35,8 @@ enum zs_mapmode { }; struct zs_pool_stats { - /* How many objects were migrated */ - unsigned long num_migrated; + /* How many pages were migrated (freed) */ + unsigned long pages_compacted; }; struct zs_pool; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8f76d8875aca..b7b4a5612ec7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1579,8 +1579,6 @@ struct zs_compact_control { /* Starting object index within @s_page which used for live object * in the subpage. 
*/ int index; - /* How many of objects were migrated */ - int nr_migrated; }; static int migrate_zspage(struct zs_pool *pool, struct size_class *class, @@ -1617,7 +1615,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); - cc->nr_migrated++; } /* Remember last position in this iteration */ @@ -1643,8 +1640,17 @@ static struct page *isolate_target_page(struct size_class *class) return page; } -static void putback_zspage(struct zs_pool *pool, struct size_class *class, - struct page *first_page) +/* + * putback_zspage - add @first_page into right class's fullness list + * @pool: target pool + * @class: destination class + * @first_page: target page + * + * Return @fist_page's fullness_group + */ +static enum fullness_group putback_zspage(struct zs_pool *pool, + struct size_class *class, + struct page *first_page) { enum fullness_group fullness; @@ -1662,6 +1668,8 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class, free_zspage(first_page); } + + return fullness; } static struct page *isolate_source_page(struct size_class *class) @@ -1704,7 +1712,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) struct page *src_page; struct page *dst_page = NULL; - cc.nr_migrated = 0; spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { @@ -1733,7 +1740,9 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) break; putback_zspage(pool, class, dst_page); - putback_zspage(pool, class, src_page); + if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + pool->stats.pages_compacted += + get_pages_per_zspage(class->size); spin_unlock(&class->lock); cond_resched(); spin_lock(&class->lock); @@ -1742,8 +1751,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) if (src_page) putback_zspage(pool, class, src_page); - pool->stats.num_migrated += cc.nr_migrated; - spin_unlock(&class->lock); } @@ -1761,7 +1768,7 @@ unsigned long zs_compact(struct zs_pool *pool) __zs_compact(pool, class); } - return pool->stats.num_migrated; + return pool->stats.pages_compacted; } EXPORT_SYMBOL_GPL(zs_compact); -- cgit v1.2.3-73-gaa49b
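With this change in place, the final column of zram's mm_stat reports freed pages rather than migrated objects. A rough userspace sketch (assuming a /sys/block/zram0 device and the seven-column mm_stat layout documented in zram.txt) that triggers compaction and reads the new counter:

/* Sketch: trigger zram compaction, then print pages_compacted
 * (the last mm_stat column after this patch). */
#include <stdio.h>

int main(void)
{
	unsigned long long orig, compr, used, limit, max_used, zero, compacted;
	FILE *f = fopen("/sys/block/zram0/compact", "w");

	if (f) {			/* any write triggers compaction */
		fputs("1\n", f);
		fclose(f);
	}

	f = fopen("/sys/block/zram0/mm_stat", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%llu %llu %llu %llu %llu %llu %llu",
		   &orig, &compr, &used, &limit, &max_used, &zero,
		   &compacted) != 7)
		return 1;
	fclose(f);
	printf("pages_compacted: %llu\n", compacted);
	return 0;
}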