diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 14 | ||||
-rw-r--r-- | mm/Kconfig.debug | 17 | ||||
-rw-r--r-- | mm/Makefile | 8 | ||||
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/bootmem.c | 811 | ||||
-rw-r--r-- | mm/cma.c | 15 | ||||
-rw-r--r-- | mm/cma_debug.c | 11 | ||||
-rw-r--r-- | mm/compaction.c | 1046 | ||||
-rw-r--r-- | mm/debug.c | 75 | ||||
-rw-r--r-- | mm/dmapool.c | 13 | ||||
-rw-r--r-- | mm/failslab.c | 14 | ||||
-rw-r--r-- | mm/filemap.c | 1088 | ||||
-rw-r--r-- | mm/gup.c | 347 | ||||
-rw-r--r-- | mm/gup_benchmark.c | 48 | ||||
-rw-r--r-- | mm/highmem.c | 5 | ||||
-rw-r--r-- | mm/hmm.c | 445 | ||||
-rw-r--r-- | mm/huge_memory.c | 247 | ||||
-rw-r--r-- | mm/hugetlb.c | 132 | ||||
-rw-r--r-- | mm/internal.h | 50 | ||||
-rw-r--r-- | mm/kasan/Makefile | 18 | ||||
-rw-r--r-- | mm/kasan/common.c (renamed from mm/kasan/kasan.c) | 684 | ||||
-rw-r--r-- | mm/kasan/generic.c | 325 | ||||
-rw-r--r-- | mm/kasan/generic_report.c | 150 | ||||
-rw-r--r-- | mm/kasan/init.c (renamed from mm/kasan/kasan_init.c) | 86 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 62 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 21 | ||||
-rw-r--r-- | mm/kasan/report.c | 272 | ||||
-rw-r--r-- | mm/kasan/tags.c | 161 | ||||
-rw-r--r-- | mm/kasan/tags_report.c | 58 | ||||
-rw-r--r-- | mm/khugepaged.c | 282 | ||||
-rw-r--r-- | mm/kmemleak.c | 73 | ||||
-rw-r--r-- | mm/ksm.c | 112 | ||||
-rw-r--r-- | mm/list_lru.c | 3 | ||||
-rw-r--r-- | mm/maccess.c | 6 | ||||
-rw-r--r-- | mm/madvise.c | 23 | ||||
-rw-r--r-- | mm/memblock.c | 233 | ||||
-rw-r--r-- | mm/memcontrol.c | 255 | ||||
-rw-r--r-- | mm/memfd.c | 108 | ||||
-rw-r--r-- | mm/memory-failure.c | 23 | ||||
-rw-r--r-- | mm/memory.c | 379 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 497 | ||||
-rw-r--r-- | mm/mempolicy.c | 48 | ||||
-rw-r--r-- | mm/mempool.c | 8 | ||||
-rw-r--r-- | mm/migrate.c | 364 | ||||
-rw-r--r-- | mm/mincore.c | 6 | ||||
-rw-r--r-- | mm/mlock.c | 14 | ||||
-rw-r--r-- | mm/mm_init.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 159 | ||||
-rw-r--r-- | mm/mmu_gather.c | 2 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 62 | ||||
-rw-r--r-- | mm/mprotect.c | 21 | ||||
-rw-r--r-- | mm/mremap.c | 113 | ||||
-rw-r--r-- | mm/nobootmem.c | 445 | ||||
-rw-r--r-- | mm/nommu.c | 6 | ||||
-rw-r--r-- | mm/oom_kill.c | 128 | ||||
-rw-r--r-- | mm/page-writeback.c | 164 | ||||
-rw-r--r-- | mm/page_alloc.c | 1007 | ||||
-rw-r--r-- | mm/page_ext.c | 13 | ||||
-rw-r--r-- | mm/page_idle.c | 10 | ||||
-rw-r--r-- | mm/page_io.c | 16 | ||||
-rw-r--r-- | mm/page_isolation.c | 10 | ||||
-rw-r--r-- | mm/page_owner.c | 11 | ||||
-rw-r--r-- | mm/page_poison.c | 12 | ||||
-rw-r--r-- | mm/page_vma_mapped.c | 24 | ||||
-rw-r--r-- | mm/percpu-km.c | 7 | ||||
-rw-r--r-- | mm/percpu.c | 52 | ||||
-rw-r--r-- | mm/readahead.c | 24 | ||||
-rw-r--r-- | mm/rmap.c | 70 | ||||
-rw-r--r-- | mm/shmem.c | 969 | ||||
-rw-r--r-- | mm/slab.c | 90 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 173 | ||||
-rw-r--r-- | mm/slub.c | 228 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 6 | ||||
-rw-r--r-- | mm/sparse.c | 67 | ||||
-rw-r--r-- | mm/swap.c | 47 | ||||
-rw-r--r-- | mm/swap_state.c | 143 | ||||
-rw-r--r-- | mm/swapfile.c | 572 | ||||
-rw-r--r-- | mm/truncate.c | 41 | ||||
-rw-r--r-- | mm/usercopy.c | 9 | ||||
-rw-r--r-- | mm/userfaultfd.c | 64 | ||||
-rw-r--r-- | mm/util.c | 55 | ||||
-rw-r--r-- | mm/vmalloc.c | 465 | ||||
-rw-r--r-- | mm/vmscan.c | 304 | ||||
-rw-r--r-- | mm/vmstat.c | 36 | ||||
-rw-r--r-- | mm/workingset.c | 208 | ||||
-rw-r--r-- | mm/z3fold.c | 101 | ||||
-rw-r--r-- | mm/zsmalloc.c | 2 | ||||
-rw-r--r-- | mm/zswap.c | 4 |
89 files changed, 7834 insertions, 6777 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index de64ea658716..25c71eb8a7db 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -127,9 +127,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK - bool - config HAVE_MEMBLOCK_NODE_MAP bool @@ -142,9 +139,6 @@ config HAVE_GENERIC_GUP config ARCH_DISCARD_MEMBLOCK bool -config NO_BOOTMEM - bool - config MEMORY_ISOLATION bool @@ -297,6 +291,7 @@ config MMU_NOTIFIER config KSM bool "Enable KSM for page merging" depends on MMU + select XXHASH help Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be @@ -379,7 +374,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -481,7 +476,7 @@ config FRONTSWAP config CMA bool "Contiguous Memory Allocator" - depends on HAVE_MEMBLOCK && MMU + depends on MMU select MIGRATION select MEMORY_ISOLATION help @@ -634,7 +629,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -671,7 +665,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Device memory hotplug support allows for establishing pmem, diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 9a7b8b049d04..e3df921208c0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? 
This value can be overridden by debug_pagealloc=off|on. +config PAGE_OWNER + bool "Track page owner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of what call chain is the owner of a page, may + help to find bare alloc_page(s) leaks. Even if you include this + feature on your build, it is disabled in default. You should pass + "page_owner=on" to boot parameter in order to enable it. Eats + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + for user-space helper. + + If unsure, say N. + config PAGE_POISONING bool "Poison pages after freeing" select PAGE_POISONING_NO_SANITY if HIBERNATION diff --git a/mm/Makefile b/mm/Makefile index 6485d5745dd7..d210cc9d6f80 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -42,17 +42,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ debug.o $(mmu-y) obj-y += init-mm.o - -ifdef CONFIG_NO_BOOTMEM - obj-y += nobootmem.o -else - obj-y += bootmem.o -endif +obj-y += memblock.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif -obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_FRONTSWAP) += frontswap.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a8bb8796c6c..72e6d0c55cfa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { diff --git a/mm/bootmem.c b/mm/bootmem.c deleted file mode 100644 index 97db0e8e362b..000000000000 --- a/mm/bootmem.c +++ /dev/null @@ -1,811 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 
Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). - */ -#include <linux/init.h> -#include <linux/pfn.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/kmemleak.h> -#include <linux/range.h> -#include <linux/bug.h> -#include <linux/io.h> -#include <linux/bootmem.h> - -#include "internal.h" - -/** - * DOC: bootmem overview - * - * Bootmem is a boot-time physical memory allocator and configurator. - * - * It is used early in the boot process before the page allocator is - * set up. - * - * Bootmem is based on the most basic of allocators, a First Fit - * allocator which uses a bitmap to represent memory. If a bit is 1, - * the page is allocated and 0 if unallocated. To satisfy allocations - * of sizes smaller than a page, the allocator records the Page Frame - * Number (PFN) of the last allocation and the offset the allocation - * ended at. Subsequent small allocations are merged together and - * stored on the same page. - * - * The information used by the bootmem allocator is represented by - * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES - * such structures is statically allocated and then it is discarded - * when the system initialization completes. Each entry in this array - * corresponds to a node with memory. For UMA systems only entry 0 is - * used. - * - * The bootmem allocator is initialized during early architecture - * specific setup. Each architecture is required to supply a - * :c:func:`setup_arch` function which, among other tasks, is - * responsible for acquiring the necessary parameters to initialise - * the boot memory allocator. 
These parameters define limits of usable - * physical memory: - * - * * @min_low_pfn - the lowest PFN that is available in the system - * * @max_low_pfn - the highest PFN that may be addressed by low - * memory (%ZONE_NORMAL) - * * @max_pfn - the last PFN available to the system. - * - * After those limits are determined, the :c:func:`init_bootmem` or - * :c:func:`init_bootmem_node` function should be called to initialize - * the bootmem allocator. The UMA case should use the `init_bootmem` - * function. It will initialize ``contig_page_data`` structure that - * represents the only memory node in the system. In the NUMA case the - * `init_bootmem_node` function should be called to initialize the - * bootmem allocator for each node. - * - * Once the allocator is set up, it is possible to use either single - * node or NUMA variant of the allocation APIs. - */ - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { - .bdata = &bootmem_node_data[0] -}; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; - -static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); - -static int bootmem_debug; - -static int __init bootmem_debug_setup(char *buf) -{ - bootmem_debug = 1; - return 0; -} -early_param("bootmem_debug", bootmem_debug_setup); - -#define bdebug(fmt, args...) ({ \ - if (unlikely(bootmem_debug)) \ - pr_info("bootmem::%s " fmt, \ - __func__, ## args); \ -}) - -static unsigned long __init bootmap_bytes(unsigned long pages) -{ - unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); - - return ALIGN(bytes, sizeof(long)); -} - -/** - * bootmem_bootmap_pages - calculate bitmap size in pages - * @pages: number of pages the bitmap has to represent - * - * Return: the number of pages needed to hold the bitmap. 
- */ -unsigned long __init bootmem_bootmap_pages(unsigned long pages) -{ - unsigned long bytes = bootmap_bytes(pages); - - return PAGE_ALIGN(bytes) >> PAGE_SHIFT; -} - -/* - * link bdata in order - */ -static void __init link_bootmem(bootmem_data_t *bdata) -{ - bootmem_data_t *ent; - - list_for_each_entry(ent, &bdata_list, list) { - if (bdata->node_min_pfn < ent->node_min_pfn) { - list_add_tail(&bdata->list, &ent->list); - return; - } - } - - list_add_tail(&bdata->list, &bdata_list); -} - -/* - * Called once to set up the allocator itself. - */ -static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, - unsigned long mapstart, unsigned long start, unsigned long end) -{ - unsigned long mapsize; - - mminit_validate_memmodel_limits(&start, &end); - bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); - bdata->node_min_pfn = start; - bdata->node_low_pfn = end; - link_bootmem(bdata); - - /* - * Initially all pages are reserved - setup_arch() has to - * register free RAM areas explicitly. - */ - mapsize = bootmap_bytes(end - start); - memset(bdata->node_bootmem_map, 0xff, mapsize); - - bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", - bdata - bootmem_node_data, start, mapstart, end, mapsize); - - return mapsize; -} - -/** - * init_bootmem_node - register a node as boot memory - * @pgdat: node to register - * @freepfn: pfn where the bitmap for this node is to be placed - * @startpfn: first pfn on the node - * @endpfn: first pfn after the node - * - * Return: the number of bytes needed to hold the bitmap for this node. - */ -unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, - unsigned long startpfn, unsigned long endpfn) -{ - return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); -} - -/** - * init_bootmem - register boot memory - * @start: pfn where the bitmap is to be placed - * @pages: number of available physical pages - * - * Return: the number of bytes needed to hold the bitmap. 
- */ -unsigned long __init init_bootmem(unsigned long start, unsigned long pages) -{ - max_low_pfn = pages; - min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); -} - -void __init free_bootmem_late(unsigned long physaddr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(physaddr, size); - - cursor = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) -{ - struct page *page; - unsigned long *map, start, end, pages, cur, count = 0; - - if (!bdata->node_bootmem_map) - return 0; - - map = bdata->node_bootmem_map; - start = bdata->node_min_pfn; - end = bdata->node_low_pfn; - - bdebug("nid=%td start=%lx end=%lx\n", - bdata - bootmem_node_data, start, end); - - while (start < end) { - unsigned long idx, vec; - unsigned shift; - - idx = start - bdata->node_min_pfn; - shift = idx & (BITS_PER_LONG - 1); - /* - * vec holds at most BITS_PER_LONG map bits, - * bit 0 corresponds to start. - */ - vec = ~map[idx / BITS_PER_LONG]; - - if (shift) { - vec >>= shift; - if (end - start >= BITS_PER_LONG) - vec |= ~map[idx / BITS_PER_LONG + 1] << - (BITS_PER_LONG - shift); - } - /* - * If we have a properly aligned and fully unreserved - * BITS_PER_LONG block of pages in front of us, free - * it in one go. 
- */ - if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { - int order = ilog2(BITS_PER_LONG); - - __free_pages_bootmem(pfn_to_page(start), start, order); - count += BITS_PER_LONG; - start += BITS_PER_LONG; - } else { - cur = start; - - start = ALIGN(start + 1, BITS_PER_LONG); - while (vec && cur != start) { - if (vec & 1) { - page = pfn_to_page(cur); - __free_pages_bootmem(page, cur, 0); - count++; - } - vec >>= 1; - ++cur; - } - } - } - - cur = bdata->node_min_pfn; - page = virt_to_page(bdata->node_bootmem_map); - pages = bdata->node_low_pfn - bdata->node_min_pfn; - pages = bootmem_bootmap_pages(pages); - count += pages; - while (pages--) - __free_pages_bootmem(page++, cur++, 0); - bdata->node_bootmem_map = NULL; - - bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; -} - -unsigned long __init free_all_bootmem(void) -{ - unsigned long total_pages = 0; - bootmem_data_t *bdata; - - reset_all_zones_managed_pages(); - - list_for_each_entry(bdata, &bdata_list, list) - total_pages += free_all_bootmem_core(bdata); - - totalram_pages += total_pages; - - return total_pages; -} - -static void __init __free(bootmem_data_t *bdata, - unsigned long sidx, unsigned long eidx) -{ - unsigned long idx; - - bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return; - - if (bdata->hint_idx > sidx) - bdata->hint_idx = sidx; - - for (idx = sidx; idx < eidx; idx++) - if 
(!test_and_clear_bit(idx, bdata->node_bootmem_map)) - BUG(); -} - -static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, - unsigned long eidx, int flags) -{ - unsigned long idx; - int exclusive = flags & BOOTMEM_EXCLUSIVE; - - bdebug("nid=%td start=%lx end=%lx flags=%x\n", - bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn, - flags); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return 0; - - for (idx = sidx; idx < eidx; idx++) - if (test_and_set_bit(idx, bdata->node_bootmem_map)) { - if (exclusive) { - __free(bdata, sidx, idx); - return -EBUSY; - } - bdebug("silent double reserve of PFN %lx\n", - idx + bdata->node_min_pfn); - } - return 0; -} - -static int __init mark_bootmem_node(bootmem_data_t *bdata, - unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long sidx, eidx; - - bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", - bdata - bootmem_node_data, start, end, reserve, flags); - - BUG_ON(start < bdata->node_min_pfn); - BUG_ON(end > bdata->node_low_pfn); - - sidx = start - bdata->node_min_pfn; - eidx = end - bdata->node_min_pfn; - - if (reserve) - return __reserve(bdata, sidx, eidx, flags); - else - __free(bdata, sidx, eidx); - return 0; -} - -static int __init mark_bootmem(unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long pos; - bootmem_data_t *bdata; - - pos = start; - list_for_each_entry(bdata, &bdata_list, list) { - int err; - unsigned long max; - - if (pos < bdata->node_min_pfn || - pos >= bdata->node_low_pfn) { - BUG_ON(pos != start); - continue; - } - - max = min(bdata->node_low_pfn, end); - - err = mark_bootmem_node(bdata, pos, max, reserve, flags); - if (reserve && err) { - mark_bootmem(start, pos, 0, 0); - return err; - } - - if (max == end) - return 0; - pos = bdata->node_low_pfn; - } - BUG(); -} - -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - unsigned long start, 
end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -} - -void __init free_bootmem(unsigned long physaddr, unsigned long size) -{ - unsigned long start, end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem(start, end, 0, 0); -} - -/** - * reserve_bootmem_node - mark a page range as reserved - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must reside completely on the specified node. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size, int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(physaddr); - end = PFN_UP(physaddr + size); - - return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -} - -/** - * reserve_bootmem - mark a page range as reserved - * @addr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must be contiguous but may span node boundaries. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem(unsigned long addr, unsigned long size, - int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(addr); - end = PFN_UP(addr + size); - - return mark_bootmem(start, end, 1, flags); -} - -static unsigned long __init align_idx(struct bootmem_data *bdata, - unsigned long idx, unsigned long step) -{ - unsigned long base = bdata->node_min_pfn; - - /* - * Align the index with respect to the node start so that the - * combination of both satisfies the requested alignment. 
- */ - - return ALIGN(base + idx, step) - base; -} - -static unsigned long __init align_off(struct bootmem_data *bdata, - unsigned long off, unsigned long align) -{ - unsigned long base = PFN_PHYS(bdata->node_min_pfn); - - /* Same as align_idx for byte offsets */ - - return ALIGN(base + off, align) - base; -} - -static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - unsigned long fallback = 0; - unsigned long min, max, start, sidx, midx, step; - - bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", - bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, - align, goal, limit); - - BUG_ON(!size); - BUG_ON(align & (align - 1)); - BUG_ON(limit && goal + size > limit); - - if (!bdata->node_bootmem_map) - return NULL; - - min = bdata->node_min_pfn; - max = bdata->node_low_pfn; - - goal >>= PAGE_SHIFT; - limit >>= PAGE_SHIFT; - - if (limit && max > limit) - max = limit; - if (max <= min) - return NULL; - - step = max(align >> PAGE_SHIFT, 1UL); - - if (goal && min < goal && goal < max) - start = ALIGN(goal, step); - else - start = ALIGN(min, step); - - sidx = start - bdata->node_min_pfn; - midx = max - bdata->node_min_pfn; - - if (bdata->hint_idx > sidx) { - /* - * Handle the valid case of sidx being zero and still - * catch the fallback below. 
- */ - fallback = sidx + 1; - sidx = align_idx(bdata, bdata->hint_idx, step); - } - - while (1) { - int merge; - void *region; - unsigned long eidx, i, start_off, end_off; -find_block: - sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = align_idx(bdata, sidx, step); - eidx = sidx + PFN_UP(size); - - if (sidx >= midx || eidx > midx) - break; - - for (i = sidx; i < eidx; i++) - if (test_bit(i, bdata->node_bootmem_map)) { - sidx = align_idx(bdata, i, step); - if (sidx == i) - sidx += step; - goto find_block; - } - - if (bdata->last_end_off & (PAGE_SIZE - 1) && - PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = align_off(bdata, bdata->last_end_off, align); - else - start_off = PFN_PHYS(sidx); - - merge = PFN_DOWN(start_off) < sidx; - end_off = start_off + size; - - bdata->last_end_off = end_off; - bdata->hint_idx = PFN_UP(end_off); - - /* - * Reserve the area now: - */ - if (__reserve(bdata, PFN_DOWN(start_off) + merge, - PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) - BUG(); - - region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + - start_off); - memset(region, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. 
- */ - kmemleak_alloc(region, size, 0, 0); - return region; - } - - if (fallback) { - sidx = align_idx(bdata, fallback - 1, step); - fallback = 0; - goto find_block; - } - - return NULL; -} - -static void * __init alloc_bootmem_core(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - bootmem_data_t *bdata; - void *region; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - - list_for_each_entry(bdata, &bdata_list, list) { - if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) - continue; - if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) - break; - - region = alloc_bootmem_bdata(bdata, size, align, goal, limit); - if (region) - return region; - } - - return NULL; -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -restart: - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - if (goal) { - goal = 0; - goto restart; - } - - return NULL; -} - -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. 
- */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -again: - - /* do not panic in alloc_bootmem_bdata() */ - if (limit && goal + size > limit) - limit = 0; - - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); - if (ptr) - return ptr; - - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ -#ifdef MAX_DMA32_PFN - unsigned long end_pfn; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, 
GFP_NOWAIT, pgdat->node_id); - - /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat_end_pfn(pgdat); - - if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && - (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { - void *ptr; - unsigned long new_goal; - - new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, - new_goal, 0); - if (ptr) - return ptr; - } -#endif - - return __alloc_bootmem_node(pgdat, size, align, goal); - -} - -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -} @@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base, ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) - goto err; + goto free_mem; pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; +free_mem: + memblock_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; @@ -407,6 +409,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + size_t i; struct page *page = NULL; int ret = -ENOMEM; @@ -466,6 +469,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); + /* + * CMA can allocate multiple page blocks, which 
results in different + * blocks being marked with different tags. Reset the tags to ignore + * those page blocks. + */ + if (page) { + for (i = 0; i < count; i++) + page_kasan_tag_reset(page + i); + } + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ad6723e9d110..8d7b2fd52225 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -21,8 +21,6 @@ struct cma_mem { unsigned long n; }; -static struct dentry *cma_debugfs_root; - static int cma_debugfs_get(void *data, u64 *val) { unsigned long *p = data; @@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val) } DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); -static void cma_debugfs_add_one(struct cma *cma, int idx) +static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; char name[16]; @@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) scnprintf(name, sizeof(name), "cma-%s", cma->name); - tmp = debugfs_create_dir(name, cma_debugfs_root); + tmp = debugfs_create_dir(name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); @@ -188,14 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) static int __init cma_debugfs_init(void) { + struct dentry *cma_debugfs_root; int i; cma_debugfs_root = debugfs_create_dir("cma", NULL); - if (!cma_debugfs_root) - return -ENOMEM; for (i = 0; i < cma_area_count; i++) - cma_debugfs_add_one(&cma_areas[i], i); + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); return 0; } diff --git a/mm/compaction.c b/mm/compaction.c index faca45ebe62d..f171a83707ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/page_owner.h> +#include <linux/psi.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -65,7 
+66,7 @@ static unsigned long release_freepages(struct list_head *freelist) return high_pfn; } -static void map_pages(struct list_head *list) +static void split_map_pages(struct list_head *list) { unsigned int i, order, nr_pages; struct page *page, *next; @@ -236,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page) return false; } +static bool +__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, + bool check_target) +{ + struct page *page = pfn_to_online_page(pfn); + struct page *end_page; + unsigned long block_pfn; + + if (!page) + return false; + if (zone != page_zone(page)) + return false; + if (pageblock_skip_persistent(page)) + return false; + + /* + * If skip is already cleared do no further checking once the + * restart points have been set. + */ + if (check_source && check_target && !get_pageblock_skip(page)) + return true; + + /* + * If clearing skip for the target scanner, do not select a + * non-movable pageblock as the starting point. + */ + if (!check_source && check_target && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + return false; + + /* + * Only clear the hint if a sample indicates there is either a + * free page or an LRU page in the block. One or other condition + * is necessary for the block to be a migration source/target. 
+ */ + block_pfn = pageblock_start_pfn(pfn); + pfn = max(block_pfn, zone->zone_start_pfn); + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + return false; + pfn = block_pfn + pageblock_nr_pages; + pfn = min(pfn, zone_end_pfn(zone)); + end_page = pfn_to_page(pfn); + + do { + if (pfn_valid_within(pfn)) { + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } + + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; + } + } + + page += (1 << PAGE_ALLOC_COSTLY_ORDER); + pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); + } while (page < end_page); + + return false; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -243,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page) */ static void __reset_isolation_suitable(struct zone *zone) { - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); - unsigned long pfn; + unsigned long migrate_pfn = zone->zone_start_pfn; + unsigned long free_pfn = zone_end_pfn(zone); + unsigned long reset_migrate = free_pfn; + unsigned long reset_free = migrate_pfn; + bool source_set = false; + bool free_set = false; + + if (!zone->compact_blockskip_flush) + return; zone->compact_blockskip_flush = false; - /* Walk the zone and mark every pageblock as suitable for isolation */ - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - struct page *page; - + /* + * Walk the zone and update pageblock skip information. Source looks + * for PageLRU while target looks for PageBuddy. When the scanner + * is found, both PageBuddy and PageLRU are checked as the pageblock + * is suitable as both source and target. 
+ */ + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, + free_pfn -= pageblock_nr_pages) { cond_resched(); - page = pfn_to_online_page(pfn); - if (!page) - continue; - if (zone != page_zone(page)) - continue; - if (pageblock_skip_persistent(page)) - continue; + /* Update the migrate PFN */ + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && + migrate_pfn < reset_migrate) { + source_set = true; + reset_migrate = migrate_pfn; + zone->compact_init_migrate_pfn = reset_migrate; + zone->compact_cached_migrate_pfn[0] = reset_migrate; + zone->compact_cached_migrate_pfn[1] = reset_migrate; + } - clear_pageblock_skip(page); + /* Update the free PFN */ + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && + free_pfn > reset_free) { + free_set = true; + reset_free = free_pfn; + zone->compact_init_free_pfn = reset_free; + zone->compact_cached_free_pfn = reset_free; + } } - reset_cached_positions(zone); + /* Leave no distance if no suitable block was reset */ + if (reset_migrate >= reset_free) { + zone->compact_cached_migrate_pfn[0] = migrate_pfn; + zone->compact_cached_migrate_pfn[1] = migrate_pfn; + zone->compact_cached_free_pfn = free_pfn; + } } void reset_isolation_suitable(pg_data_t *pgdat) @@ -285,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat) } /* + * Sets the pageblock skip bit if it was clear. Note that this is a hint as + * locks are not required for read/writers. Returns true if it was already set. 
+ */ +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + bool skip; + + /* Do no update if skip hint is being ignored */ + if (cc->ignore_skip_hint) + return false; + + if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + return false; + + skip = get_pageblock_skip(page); + if (!skip && !cc->no_set_skip_hint) + set_pageblock_skip(page); + + return skip; +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ + struct zone *zone = cc->zone; + + pfn = pageblock_end_pfn(pfn); + + /* Set for isolation rather than compaction */ + if (cc->no_set_skip_hint) + return; + + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; +} + +/* * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by __reset_isolation_suitable(). 
*/ static void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long pfn) { struct zone *zone = cc->zone; - unsigned long pfn; if (cc->no_set_skip_hint) return; @@ -301,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (nr_isolated) - return; - set_pageblock_skip(page); - pfn = page_to_pfn(page); - /* Update where async and sync compaction should restart */ - if (migrate_scanner) { - if (pfn > zone->compact_cached_migrate_pfn[0]) - zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->mode != MIGRATE_ASYNC && - pfn > zone->compact_cached_migrate_pfn[1]) - zone->compact_cached_migrate_pfn[1] = pfn; - } else { - if (pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -333,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page) } static inline void update_pageblock_skip(struct compact_control *cc, - struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + struct page *page, unsigned long pfn) +{ +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ +} + +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) { + return false; } #endif /* CONFIG_COMPACTION */ /* * Compaction requires the taking of some coarse locks that are potentially - * very heavily contended. For async compaction, back out if the lock cannot - * be taken immediately. For sync compaction, spin on the lock if needed. + * very heavily contended. For async compaction, trylock and record if the + * lock is contended. The lock will still be acquired but compaction will + * abort when the current block is finished regardless of success rate. 
+ * Sync compaction acquires the lock. * - * Returns true if the lock is held - * Returns false if the lock is not held and compaction should abort + * Always returns true which makes it easier to track lock state in callers. */ -static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, +static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, struct compact_control *cc) { - if (cc->mode == MIGRATE_ASYNC) { - if (!spin_trylock_irqsave(lock, *flags)) { - cc->contended = true; - return false; - } - } else { - spin_lock_irqsave(lock, *flags); + /* Track if the lock is contended in async mode */ + if (cc->mode == MIGRATE_ASYNC && !cc->contended) { + if (spin_trylock_irqsave(lock, *flags)) + return true; + + cc->contended = true; } + spin_lock_irqsave(lock, *flags); return true; } @@ -390,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, return true; } - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; - return true; - } - cond_resched(); - } - - return false; -} - -/* - * Aside from avoiding lock contention, compaction also periodically checks - * need_resched() and either schedules in sync compaction or aborts async - * compaction. This is similar to what compact_unlock_should_abort() does, but - * is used where no lock is concerned. - * - * Returns false when no scheduling was needed, or sync compaction scheduled. - * Returns true when async compaction should abort. 
- */ -static inline bool compact_should_abort(struct compact_control *cc) -{ - /* async compaction aborts if contended */ - if (need_resched()) { - if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; - return true; - } - - cond_resched(); - } + cond_resched(); return false; } @@ -434,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long *start_pfn, unsigned long end_pfn, struct list_head *freelist, + unsigned int stride, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor, *valid_page = NULL; + struct page *cursor; unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; unsigned int order; + /* Strict mode is for isolation, speed is secondary */ + if (strict) + stride = 1; + cursor = pfn_to_page(blockpfn); /* Isolate free pages. */ - for (; blockpfn < end_pfn; blockpfn++, cursor++) { + for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { int isolated; struct page *page = cursor; @@ -464,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!pfn_valid_within(blockpfn)) goto isolate_fail; - if (!valid_page) - valid_page = page; - /* * For compound pages such as THP and hugetlbfs, we can save * potentially a lot of iterations if we skip them at once. @@ -494,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * recheck as well. */ if (!locked) { - /* - * The zone lock must be held to isolate freepages. - * Unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock and we acquire the lock as late as - * possible. 
- */ - locked = compact_trylock_irqsave(&cc->zone->lock, + locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc); - if (!locked) - break; /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -564,10 +650,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); - cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); @@ -625,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, true); + block_end_pfn, &freelist, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -643,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc, } /* __isolate_free_page() does not map the pages */ - map_pages(&freelist); + split_map_pages(&freelist); if (pfn < end_pfn) { /* Loop terminated early, cleanup. 
*/ @@ -656,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(struct zone *zone) +static bool too_many_isolated(pg_data_t *pgdat) { unsigned long active, inactive, isolated; - inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); - active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + - node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); - isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + - node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON); + active = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + + node_page_state(pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -692,7 +774,7 @@ static unsigned long isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { - struct zone *zone = cc->zone; + pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; @@ -701,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long start_pfn = low_pfn; bool skip_on_failure = false; unsigned long next_skip_pfn = 0; + bool skip_updated = false; /* * Ensure that there are not too many pages isolated from the LRU * list by either parallel reclaimers or compaction. 
If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(zone))) { + while (unlikely(too_many_isolated(pgdat))) { /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; @@ -718,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, return 0; } - if (compact_should_abort(cc)) - return 0; + cond_resched(); if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { skip_on_failure = true; @@ -757,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * if contended. */ if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(zone_lru_lock(zone), flags, - &locked, cc)) + && compact_unlock_should_abort(&pgdat->lru_lock, + flags, &locked, cc)) break; if (!pfn_valid_within(low_pfn)) @@ -767,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page = pfn_to_page(low_pfn); - if (!valid_page) + /* + * Check if the pageblock has already been marked skipped. + * Only the aligned PFN is checked as the caller isolates + * COMPACT_CLUSTER_MAX at a time so the second call must + * not falsely conclude that the block should be skipped. + */ + if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { + low_pfn = end_pfn; + goto isolate_abort; + } valid_page = page; + } /* * Skip if free. 
We read page order here without zone lock @@ -817,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } @@ -847,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_trylock_irqsave(zone_lru_lock(zone), + locked = compact_lock_irqsave(&pgdat->lru_lock, &flags, cc); - if (!locked) - break; + + /* Try get exclusive access under lock */ + if (!skip_updated) { + skip_updated = true; + if (test_and_set_skip(cc, page, low_pfn)) + goto isolate_abort; + } /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) @@ -867,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) @@ -886,16 +984,13 @@ isolate_success: nr_isolated++; /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that was isolated and likely be - * then freed by migration. + * Avoid isolating too much unless this block is being + * rescanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. 
*/ - if (!cc->last_migrated_pfn) - cc->last_migrated_pfn = low_pfn; - - /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && + !cc->rescan && !cc->contended) { ++low_pfn; break; } @@ -912,12 +1007,11 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); locked = false; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - cc->last_migrated_pfn = 0; nr_isolated = 0; } @@ -938,15 +1032,23 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; +isolate_abort: if (locked) - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); /* - * Update the pageblock-skip information and cached scanner pfn, - * if the whole pageblock was scanned without isolating any page. + * Updated the cached scanner pfn once the pageblock has been scanned + * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. 
*/ - if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); + } trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); @@ -1012,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc, { int block_mt; + if (pageblock_skip_persistent(page)) + return false; + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; @@ -1049,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } +static inline unsigned int +freelist_scan_limit(struct compact_control *cc) +{ + return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; +} + /* * Test whether the free scanner has reached the same or lower pageblock than * the migration scanner, and compaction should thus terminate. @@ -1060,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc) } /* + * Used when scanning for a suitable migration target which scans freelists + * in reverse. Reorders the list such as the unscanned pages are scanned + * first on the next iteration of the free scanner + */ +static void +move_freelist_head(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_last(freelist, &freepage->lru)) { + list_cut_before(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + +/* + * Similar to move_freelist_head except used by the migration scanner + * when scanning forward. It's possible for these list operations to + * move against each other if they search the free list exactly in + * lockstep. 
+ */ +static void +move_freelist_tail(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_first(freelist, &freepage->lru)) { + list_cut_position(&sublist, freelist, &freepage->lru); + if (!list_empty(&sublist)) + list_splice_tail(&sublist, freelist); + } +} + +static void +fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +{ + unsigned long start_pfn, end_pfn; + struct page *page = pfn_to_page(pfn); + + /* Do not search around if there are enough pages already */ + if (cc->nr_freepages >= cc->nr_migratepages) + return; + + /* Minimise scanning during async compaction */ + if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) + return; + + /* Pageblock boundaries */ + start_pfn = pageblock_start_pfn(pfn); + end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); + + /* Scan before */ + if (start_pfn != pfn) { + isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); + if (cc->nr_freepages >= cc->nr_migratepages) + return; + } + + /* Scan after */ + start_pfn = pfn + nr_isolated; + if (start_pfn != end_pfn) + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + + /* Skip this pageblock in the future as it's full or nearly full */ + if (cc->nr_freepages < cc->nr_migratepages) + set_pageblock_skip(page); +} + +/* Search orders in round-robin fashion */ +static int next_search_order(struct compact_control *cc, int order) +{ + order--; + if (order < 0) + order = cc->order - 1; + + /* Search wrapped around? 
*/ + if (order == cc->search_order) { + cc->search_order--; + if (cc->search_order < 0) + cc->search_order = cc->order - 1; + return -1; + } + + return order; +} + +static unsigned long +fast_isolate_freepages(struct compact_control *cc) +{ + unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); + unsigned int nr_scanned = 0; + unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; + unsigned long nr_isolated = 0; + unsigned long distance; + struct page *page = NULL; + bool scan_start = false; + int order; + + /* Full compaction passes in a negative order */ + if (cc->order <= 0) + return cc->free_pfn; + + /* + * If starting the scan, use a deeper search and use the highest + * PFN found if a suitable one is not found. + */ + if (cc->free_pfn >= cc->zone->compact_init_free_pfn) { + limit = pageblock_nr_pages >> 1; + scan_start = true; + } + + /* + * Preferred point is in the top quarter of the scan space but take + * a pfn from the top half if the search is problematic. + */ + distance = (cc->free_pfn - cc->migrate_pfn); + low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2)); + min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); + + if (WARN_ON_ONCE(min_pfn > low_pfn)) + low_pfn = min_pfn; + + /* + * Search starts from the last successful isolation order or the next + * order to search after a previous failure + */ + cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order); + + for (order = cc->search_order; + !page && order >= 0; + order = next_search_order(cc, order)) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + struct page *freepage; + unsigned long flags; + unsigned int order_scanned = 0; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry_reverse(freepage, freelist, lru) { + unsigned long pfn; + + order_scanned++; + nr_scanned++; + pfn = page_to_pfn(freepage); + + if (pfn 
>= highest) + highest = pageblock_start_pfn(pfn); + + if (pfn >= low_pfn) { + cc->fast_search_fail = 0; + cc->search_order = order; + page = freepage; + break; + } + + if (pfn >= min_pfn && pfn > high_pfn) { + high_pfn = pfn; + + /* Shorten the scan if a candidate is found */ + limit >>= 1; + } + + if (order_scanned >= limit) + break; + } + + /* Use a minimum pfn if a preferred one was not found */ + if (!page && high_pfn) { + page = pfn_to_page(high_pfn); + + /* Update freepage for the list reorder below */ + freepage = page; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_head(freelist, freepage); + + /* Isolate the page if available */ + if (page) { + if (__isolate_free_page(page, order)) { + set_page_private(page, order); + nr_isolated = 1 << order; + cc->nr_freepages += nr_isolated; + list_add_tail(&page->lru, &cc->freepages); + count_compact_events(COMPACTISOLATED, nr_isolated); + } else { + /* If isolation fails, abort the search */ + order = -1; + page = NULL; + } + } + + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* + * Smaller scan on next order so the total scan ig related + * to freelist_scan_limit. + */ + if (order_scanned >= limit) + limit = min(1U, limit >> 1); + } + + if (!page) { + cc->fast_search_fail++; + if (scan_start) { + /* + * Use the highest PFN found above min. If one was + * not found, be pessemistic for direct compaction + * and use the min mark. 
+ */ + if (highest) { + page = pfn_to_page(highest); + cc->free_pfn = highest; + } else { + if (cc->direct_compaction) { + page = pfn_to_page(min_pfn); + cc->free_pfn = min_pfn; + } + } + } + } + + if (highest && highest >= cc->zone->compact_cached_free_pfn) { + highest -= pageblock_nr_pages; + cc->zone->compact_cached_free_pfn = highest; + } + + cc->total_free_scanned += nr_scanned; + if (!page) + return cc->free_pfn; + + low_pfn = page_to_pfn(page); + fast_isolate_around(cc, low_pfn, nr_isolated); + return low_pfn; +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -1072,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ struct list_head *freelist = &cc->freepages; + unsigned int stride; + + /* Try a small search of the free lists for a candidate */ + isolate_start_pfn = fast_isolate_freepages(cc); + if (cc->nr_freepages) + goto splitmap; /* * Initialise the free scanner. The starting point is where we last @@ -1085,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = pageblock_start_pfn(cc->free_pfn); + block_start_pfn = pageblock_start_pfn(isolate_start_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); low_pfn = pageblock_end_pfn(cc->migrate_pfn); + stride = cc->mode == MIGRATE_ASYNC ? 
COMPACT_CLUSTER_MAX : 1; /* * Isolate free pages until enough are available to migrate the @@ -1099,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long nr_isolated; + /* * This can iterate a massively long zone without finding any - * suitable migration targets, so periodically check if we need - * to schedule, or even abort async compaction. + * suitable migration targets, so periodically check resched. */ - if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); @@ -1122,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, - freelist, false); + nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, stride, false); - /* - * If we isolated enough freepages, or aborted due to lock - * contention, terminate. - */ - if ((cc->nr_freepages >= cc->nr_migratepages) - || cc->contended) { + /* Update the skip hint if the full pageblock was scanned */ + if (isolate_start_pfn == block_end_pfn) + update_pageblock_skip(cc, page, block_start_pfn); + + /* Are enough freepages isolated? 
*/ + if (cc->nr_freepages >= cc->nr_migratepages) { if (isolate_start_pfn >= block_end_pfn) { /* * Restart at previous pageblock if more @@ -1147,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc) */ break; } - } - /* __isolate_free_page() does not map the pages */ - map_pages(freelist); + /* Adjust stride depending on isolation */ + if (nr_isolated) { + stride = 1; + continue; + } + stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1); + } /* * Record where the free scanner will restart next time. Either we @@ -1159,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; + +splitmap: + /* __isolate_free_page() does not map the pages */ + split_map_pages(freelist); } /* @@ -1171,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* - * Isolate free pages if necessary, and if we are not aborting due to - * contention. 
- */ if (list_empty(&cc->freepages)) { - if (!cc->contended) - isolate_freepages(cc); + isolate_freepages(cc); if (list_empty(&cc->freepages)) return NULL; @@ -1216,6 +1579,147 @@ typedef enum { */ int sysctl_compact_unevictable_allowed __read_mostly = 1; +static inline void +update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) +{ + if (cc->fast_start_pfn == ULONG_MAX) + return; + + if (!cc->fast_start_pfn) + cc->fast_start_pfn = pfn; + + cc->fast_start_pfn = min(cc->fast_start_pfn, pfn); +} + +static inline unsigned long +reinit_migrate_pfn(struct compact_control *cc) +{ + if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX) + return cc->migrate_pfn; + + cc->migrate_pfn = cc->fast_start_pfn; + cc->fast_start_pfn = ULONG_MAX; + + return cc->migrate_pfn; +} + +/* + * Briefly search the free lists for a migration source that already has + * some free pages to reduce the number of pages that need migration + * before a pageblock is free. + */ +static unsigned long fast_find_migrateblock(struct compact_control *cc) +{ + unsigned int limit = freelist_scan_limit(cc); + unsigned int nr_scanned = 0; + unsigned long distance; + unsigned long pfn = cc->migrate_pfn; + unsigned long high_pfn; + int order; + + /* Skip hints are relied on to avoid repeats on the fast search */ + if (cc->ignore_skip_hint) + return pfn; + + /* + * If the migrate_pfn is not at the start of a zone or the start + * of a pageblock then assume this is a continuation of a previous + * scan restarted due to COMPACT_CLUSTER_MAX. + */ + if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) + return pfn; + + /* + * For smaller orders, just linearly scan as the number of pages + * to migrate should be relatively small and does not necessarily + * justify freeing up a large block for a small allocation. 
+ */ + if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) + return pfn; + + /* + * Only allow kcompactd and direct requests for movable pages to + * quickly clear out a MOVABLE pageblock for allocation. This + * reduces the risk that a large movable pageblock is freed for + * an unmovable/reclaimable small allocation. + */ + if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) + return pfn; + + /* + * When starting the migration scanner, pick any pageblock within the + * first half of the search space. Otherwise try and pick a pageblock + * within the first eighth to reduce the chances that a migration + * target later becomes a source. + */ + distance = (cc->free_pfn - cc->migrate_pfn) >> 1; + if (cc->migrate_pfn != cc->zone->zone_start_pfn) + distance >>= 2; + high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); + + for (order = cc->order - 1; + order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; + order--) { + struct free_area *area = &cc->zone->free_area[order]; + struct list_head *freelist; + unsigned long flags; + struct page *freepage; + + if (!area->nr_free) + continue; + + spin_lock_irqsave(&cc->zone->lock, flags); + freelist = &area->free_list[MIGRATE_MOVABLE]; + list_for_each_entry(freepage, freelist, lru) { + unsigned long free_pfn; + + nr_scanned++; + free_pfn = page_to_pfn(freepage); + if (free_pfn < high_pfn) { + /* + * Avoid if skipped recently. Ideally it would + * move to the tail but even safe iteration of + * the list assumes an entry is deleted, not + * reordered. 
+ */ + if (get_pageblock_skip(freepage)) { + if (list_is_last(freelist, &freepage->lru)) + break; + + continue; + } + + /* Reorder to so a future search skips recent pages */ + move_freelist_tail(freelist, freepage); + + update_fast_start_pfn(cc, free_pfn); + pfn = pageblock_start_pfn(free_pfn); + cc->fast_search_fail = 0; + set_pageblock_skip(freepage); + break; + } + + if (nr_scanned >= limit) { + cc->fast_search_fail++; + move_freelist_tail(freelist, freepage); + break; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + + cc->total_migrate_scanned += nr_scanned; + + /* + * If fast scanning failed then use a cached entry for a page block + * that had free pages as the basis for starting a linear scan. + */ + if (pfn == cc->migrate_pfn) + pfn = reinit_migrate_pfn(cc); + + return pfn; +} + /* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within @@ -1231,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); + bool fast_find_block; /* * Start at where we last stopped, or beginning of the zone as - * initialized by compact_zone() + * initialized by compact_zone(). The first failure will use + * the lowest PFN as the starting point for linear scanning. */ - low_pfn = cc->migrate_pfn; + low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; + /* + * fast_find_migrateblock marks a pageblock skipped so to avoid + * the isolation_suitable check below, check whether the fast + * search was successful. 
+ */ + fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; + /* Only scan within a pageblock boundary */ block_end_pfn = pageblock_end_pfn(low_pfn); @@ -1249,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Do not cross the free scanner. */ for (; block_end_pfn <= cc->free_pfn; + fast_find_block = false, low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1256,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* * This can potentially iterate a massively long zone with * many pageblocks unsuitable, so periodically check if we - * need to schedule, or even abort async compaction. + * need to schedule. */ - if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) - && compact_should_abort(cc)) - break; + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) + cond_resched(); page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); if (!page) continue; - /* If isolation recently failed, do not retry */ - if (!isolation_suitable(cc, page)) + /* + * If isolation recently failed, do not retry. Only check the + * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock + * to be visited multiple times. Assume skip was checked + * before making it "skip" so other compaction instances do + * not scan the same block. + */ + if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + !fast_find_block && !isolation_suitable(cc, page)) continue; /* - * For async compaction, also only scan in MOVABLE blocks. - * Async compaction is optimistic to see if the minimum amount - * of work satisfies the allocation. + * For async compaction, also only scan in MOVABLE blocks + * without huge pages. Async compaction is optimistic to see + * if the minimum amount of work satisfies the allocation. + * The cached PFN is updated as it's possible that all + * remaining blocks between source and target are unsuitable + * and the compaction scanners fail to meet. 
*/ - if (!suitable_migration_source(cc, page)) + if (!suitable_migration_source(cc, page)) { + update_cached_migrate(cc, block_end_pfn); continue; + } /* Perform the isolation */ low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) + if (!low_pfn) return ISOLATE_ABORT; /* @@ -1309,19 +1834,16 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; - - if (cc->contended || fatal_signal_pending(current)) - return COMPACT_CONTENDED; + int ret; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ - reset_cached_positions(zone); + reset_cached_positions(cc->zone); /* * Mark that the PG_migrate_skip information should be cleared @@ -1330,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone, * based on an allocation request. */ if (cc->direct_compaction) - zone->compact_blockskip_flush = true; + cc->zone->compact_blockskip_flush = true; if (cc->whole_zone) return COMPACT_COMPLETE; @@ -1341,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone, if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - if (cc->finishing_block) { - /* - * We have finished the pageblock, but better check again that - * we really succeeded. - */ - if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) - cc->finishing_block = false; - else - return COMPACT_CONTINUE; - } + /* + * Always finish scanning a pageblock to reduce the possibility of + * fallbacks in the future. This is particularly important when + * migration source is unmovable/reclaimable but it's not worth + * special casing. 
+ */ + if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ + ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; + struct free_area *area = &cc->zone->free_area[order]; bool can_steal; /* Job done if page is free of the right migratetype */ @@ -1392,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone, return COMPACT_SUCCESS; } - cc->finishing_block = true; - return COMPACT_CONTINUE; + ret = COMPACT_CONTINUE; + break; } } - return COMPACT_NO_SUITABLE_PAGE; + if (cc->contended || fatal_signal_pending(current)) + ret = COMPACT_CONTENDED; + + return ret; } -static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc) +static enum compact_result compact_finished(struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc); - trace_mm_compaction_finished(zone, cc->order, ret); + ret = __compact_finished(cc); + trace_mm_compaction_finished(cc->zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1430,7 +1953,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. 
@@ -1533,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, return false; } -static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result +compact_zone(struct compact_control *cc, struct capture_control *capc) { enum compact_result ret; - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); + unsigned long start_pfn = cc->zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(cc->zone); + unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; + bool update_cached; cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); - ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) @@ -1554,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order)) - __reset_isolation_suitable(zone); + if (compaction_restarting(cc->zone, cc->order)) + __reset_isolation_suitable(cc->zone); /* * Setup to move all movable pages to the end of the zone. Used cached @@ -1563,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro * want to compact the whole zone), but check that it is initialised * by ensuring the values are within zone boundaries. 
*/ + cc->fast_start_pfn = 0; if (cc->whole_zone) { cc->migrate_pfn = start_pfn; cc->free_pfn = pageblock_start_pfn(end_pfn - 1); } else { - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; + cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = cc->zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; + cc->zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - if (cc->migrate_pfn == start_pfn) + if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) cc->whole_zone = true; } - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; + + /* + * Migrate has separate cached PFNs for ASYNC and SYNC* migration on + * the basis that some migrations will fail in ASYNC mode. However, + * if the cached PFNs match and pageblocks are skipped due to having + * no isolation candidates, then the sync state does not matter. + * Until a pageblock with isolation candidates is found, keep the + * cached PFNs in sync to avoid revisiting the same blocks. 
+ */ + update_cached = !sync && + cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); migrate_prep_local(); - while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; + unsigned long start_pfn = cc->migrate_pfn; + + /* + * Avoid multiple rescans which can happen if a page cannot be + * isolated (dirty/writeback in async mode) or if the migrated + * pages are being allocated before the pageblock is cleared. + * The first rescan will capture the entire pageblock for + * migration. If it fails, it'll be marked skip and scanning + * will proceed as normal. + */ + cc->rescan = false; + if (pageblock_start_pfn(last_migrated_pfn) == + pageblock_start_pfn(start_pfn)) { + cc->rescan = true; + } - switch (isolate_migratepages(zone, cc)) { + switch (isolate_migratepages(cc->zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; + last_migrated_pfn = 0; goto out; case ISOLATE_NONE: + if (update_cached) { + cc->zone->compact_cached_migrate_pfn[1] = + cc->zone->compact_cached_migrate_pfn[0]; + } + /* * We haven't isolated and migrated anything, but * there might still be unflushed migrations from @@ -1607,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro */ goto check_drain; case ISOLATE_SUCCESS: + update_cached = false; + last_migrated_pfn = start_pfn; ; } @@ -1638,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro cc->migrate_pfn = block_end_pfn( cc->migrate_pfn - 1, cc->order); /* Draining pcplists is useless in this case */ - cc->last_migrated_pfn = 0; - + last_migrated_pfn = 0; } } @@ -1651,21 +2211,26 @@ check_drain: * compact_finished() can detect immediately if allocation * would succeed. 
*/ - if (cc->order > 0 && cc->last_migrated_pfn) { + if (cc->order > 0 && last_migrated_pfn) { int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); - if (cc->last_migrated_pfn < current_block_start) { + if (last_migrated_pfn < current_block_start) { cpu = get_cpu(); lru_add_drain_cpu(cpu); - drain_local_pages(zone); + drain_local_pages(cc->zone); put_cpu(); /* No more flushing until we migrate again */ - cc->last_migrated_pfn = 0; + last_migrated_pfn = 0; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } } out: @@ -1684,8 +2249,8 @@ out: * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() */ - if (free_pfn > zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = free_pfn; + if (free_pfn > cc->zone->compact_cached_free_pfn) + cc->zone->compact_cached_free_pfn = free_pfn; } count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); @@ -1699,7 +2264,8 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx, + struct page **capture) { enum compact_result ret; struct compact_control cc = { @@ -1708,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .total_migrate_scanned = 0, .total_free_scanned = 0, .order = order, + .search_order = order, .gfp_mask = gfp_mask, .zone = zone, .mode = (prio == COMPACT_PRIO_ASYNC) ? 
@@ -1719,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; + struct capture_control capc = { + .cc = &cc, + .page = NULL, + }; + + if (capture) + current->capture_control = &capc; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - ret = compact_zone(zone, &cc); + ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); + *capture = capc.page; + current->capture_control = NULL; + return ret; } @@ -1744,7 +2321,7 @@ int sysctl_extfrag_threshold = 500; */ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum compact_priority prio) + enum compact_priority prio, struct page **capture) { int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1772,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac)); + alloc_flags, ac_classzone_idx(ac), capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -1840,7 +2417,7 @@ static void compact_node(int nid) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - compact_zone(zone, &cc); + compact_zone(&cc, NULL); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); @@ -1875,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } -int sysctl_extfrag_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec_minmax(table, write, buffer, length, ppos); - - return 0; -} - #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, @@ -1947,6 +2516,7 @@ static 
void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .search_order = pgdat->kcompactd_max_order, .total_migrate_scanned = 0, .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, @@ -1982,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (kthread_should_stop()) return; - status = compact_zone(zone, &cc); + status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); @@ -2068,11 +2638,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/debug.c b/mm/debug.c index bd10aad8539a..1611cf00a137 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -13,10 +13,11 @@ #include <trace/events/mmflags.h> #include <linux/migrate.h> #include <linux/page_owner.h> +#include <linux/ctype.h> #include "internal.h" -char *migrate_reason_names[MR_TYPES] = { +const char *migrate_reason_names[MR_TYPES] = { "compaction", "memory_failure", "memory_hotplug", @@ -43,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + struct address_space *mapping; bool page_poisoned = PagePoisoned(page); int mapcount; @@ -52,10 +54,12 @@ void __dump_page(struct page *page, const char *reason) * dump_page() when detected. */ if (page_poisoned) { - pr_emerg("page:%px is uninitialized and poisoned", page); + pr_warn("page:%px is uninitialized and poisoned", page); goto hex_only; } + mapping = page_mapping(page); + /* * Avoid VM_BUG_ON() in page_mapcount(). 
* page->_mapcount space in struct page is used by sl[aou]b pages to @@ -63,27 +67,39 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", + pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); + if (PageAnon(page)) + pr_warn("anon "); + else if (PageKsm(page)) + pr_warn("ksm "); + else if (mapping) { + pr_warn("%ps ", mapping->a_ops); + if (mapping->host->i_dentry.first) { + struct dentry *dentry; + dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); + pr_warn("name:\"%pd\" ", dentry); + } + } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); hex_only: - print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); if (reason) - pr_alert("page dumped because: %s\n", reason); + pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG if (!page_poisoned && page->mem_cgroup) - pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); + pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } @@ -175,4 +191,49 @@ void dump_mm(const struct mm_struct *mm) ); } +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. 
+ */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/dmapool.c b/mm/dmapool.c index 6d4b97e7e9e9..76a160083506 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary - * Context: !in_interrupt() + * Context: not in_interrupt() * - * Returns a dma allocation pool with the requested characteristics, or - * null if one can't be created. Given one of these pools, dma_pool_alloc() + * Given one of these pools, dma_pool_alloc() * may be used to allocate memory. Such memory will all have "consistent" * DMA mappings, accessible by the device and its driver without using * cache flushing primitives. The actual size of blocks allocated may be @@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); * cross that size boundary. This is useful for devices which have * addressing restrictions on individual DMA transfers, such as not crossing * boundaries of 4KBytes. + * + * Return: a dma allocation pool with the requested characteristics, or + * %NULL if one can't be created. 
*/ struct dma_pool *dma_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t boundary) @@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy); * @mem_flags: GFP_* bitmask * @handle: pointer to dma address of block * - * This returns the kernel virtual address of a currently unused block, + * Return: the kernel virtual address of a currently unused block, * and reports its dma address through the handle. * If such a memory block can't be allocated, %NULL is returned. */ @@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data) * * Managed dma_pool_create(). DMA pool created with this function is * automatically destroyed on driver detach. + * + * Return: a managed dma allocation pool with the requested + * characteristics, or %NULL if one can't be created. */ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, size_t size, size_t align, size_t allocation) diff --git a/mm/failslab.c b/mm/failslab.c index b135ebb88b6f..ec5aad211c5b 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter)) - goto fail; + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_reclaim); + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); return 0; -fail: - debugfs_remove_recursive(dir); - - return -ENOMEM; } late_initcall(failslab_debugfs_init); diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..a3b4021c448f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,8 @@ #include <linux/cleancache.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> +#include <linux/delayacct.h> +#include <linux/psi.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -96,8 +98,8 @@ * 
->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) + * ->pgdat->lru_lock (follow_page->mark_page_accessed) + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) @@ -111,60 +113,26 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; -} - -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); + mapping_set_update(&xas, mapping); + + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -263,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, @@ -306,7 +274,7 @@ void delete_from_page_cache(struct page *page) EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@ -319,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache); * * The function expects the i_pages lock to be held. 
*/ -static void -page_cache_tree_delete_batch(struct address_space *mapping, +static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@ -344,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, * have our pages locked so they are protected from * being removed. */ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@ -356,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@ -381,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) @@ -426,6 +392,8 @@ static int filemap_check_and_keep_errors(struct 
address_space *mapping) * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. + * + * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) @@ -472,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. + * + * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { @@ -487,24 +457,38 @@ EXPORT_SYMBOL(filemap_flush); * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. + * + * Return: %true if at least one page exists in the specified range, + * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. 
+ */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); @@ -552,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping, * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) @@ -574,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range); * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. + * + * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { @@ -595,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range); * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) + * + * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { @@ -646,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). + * + * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) @@ -701,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err); * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. 
+ * + * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { @@ -743,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err); * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. + * + * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { @@ -775,51 +771,46 @@ EXPORT_SYMBOL(file_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. + * + * Return: %0 */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; - - get_page(new); - new->mapping = mapping; - new->index = offset; + get_page(new); + new->mapping = mapping; + new->index = offset; - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - /* - * hugetlb pages do not participate in page cache accounting. 
- */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -828,12 +819,15 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@ -842,39 +836,47 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + 
mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: +error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** @@ -886,6 +888,8 @@ err_insert: * * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. + * + * Return: %0 on success, negative error code otherwise. */ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) @@ -915,12 +919,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; @@ -1003,7 +1004,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, if (wait_page->bit_nr != key->bit_nr) return 0; - /* Stop walking if it's locked */ + /* + * Stop walking if it's locked. 
+ * Is this safe if put_and_wait_on_page_locked() is in use? + * Yes: the waker must hold a reference to this page, and if PG_locked + * has now already been set by another task, that task must also hold + * a reference to the *same usage* of this page; so there is no need + * to walk on to wake even the put_and_wait_on_page_locked() callers. + */ if (test_bit(key->bit_nr, &key->page->flags)) return -1; @@ -1071,15 +1079,44 @@ static void wake_up_page(struct page *page, int bit) wake_up_page_bit(page, bit); } +/* + * A choice of three behaviors for wait_on_page_bit_common(): + */ +enum behavior { + EXCLUSIVE, /* Hold ref to page and take the bit when woken, like + * __lock_page() waiting on then setting PG_locked. + */ + SHARED, /* Hold ref to page and check the bit when woken, like + * wait_on_page_writeback() waiting on PG_writeback. + */ + DROP, /* Drop ref to page before wait, no check when woken, + * like put_and_wait_on_page_locked() on PG_locked. + */ +}; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, - struct page *page, int bit_nr, int state, bool lock) + struct page *page, int bit_nr, int state, enum behavior behavior) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool bit_is_set; + bool thrashing = false; + bool delayacct = false; + unsigned long pflags; int ret = 0; + if (bit_nr == PG_locked && + !PageUptodate(page) && PageWorkingset(page)) { + if (!PageSwapBacked(page)) { + delayacct_thrashing_start(); + delayacct = true; + } + psi_memstall_enter(&pflags); + thrashing = true; + } + init_wait(wait); - wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; + wait->flags = behavior == EXCLUSIVE ? 
WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; @@ -1096,26 +1133,46 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, spin_unlock_irq(&q->lock); - if (likely(test_bit(bit_nr, &page->flags))) { + bit_is_set = test_bit(bit_nr, &page->flags); + if (behavior == DROP) + put_page(page); + + if (likely(bit_is_set)) io_schedule(); - } - if (lock) { + if (behavior == EXCLUSIVE) { if (!test_and_set_bit_lock(bit_nr, &page->flags)) break; - } else { + } else if (behavior == SHARED) { if (!test_bit(bit_nr, &page->flags)) break; } - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; break; } + + if (behavior == DROP) { + /* + * We can no longer safely access page->flags: + * even if CONFIG_MEMORY_HOTREMOVE is not enabled, + * there is a risk of waiting forever on a page reused + * for something that keeps it locked indefinitely. + * But best check for -EINTR above before breaking. + */ + break; + } } finish_wait(q, wait); + if (thrashing) { + if (delayacct) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } + /* * A signal could leave PageWaiters set. 
Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), @@ -1130,18 +1187,37 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, void wait_on_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); + wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(wait_on_page_bit); int wait_on_page_bit_killable(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); + return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(wait_on_page_bit_killable); /** + * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked + * @page: The page to wait for. + * + * The caller should hold a reference on @page. They expect the page to + * become unlocked relatively soon, but do not wish to hold up migration + * (for example) by holding the reference while waiting for the page to + * come unlocked. After this function returns, the caller should not + * dereference @page. 
+ */ +void put_and_wait_on_page_locked(struct page *page) +{ + wait_queue_head_t *q; + + page = compound_head(page); + q = page_waitqueue(page); + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); +} + +/** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest * @waiter: Waiter to add to the queue @@ -1270,7 +1346,8 @@ void __lock_page(struct page *__page) { struct page *page = compound_head(__page); wait_queue_head_t *q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, + EXCLUSIVE); } EXPORT_SYMBOL(__lock_page); @@ -1278,7 +1355,8 @@ int __lock_page_killable(struct page *__page) { struct page *page = compound_head(__page); wait_queue_head_t *q = page_waitqueue(page); - return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); + return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, + EXCLUSIVE); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -1326,86 +1404,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. 
For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. */ -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_next_hole); +EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. 
- * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the previous gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. 
*/ -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_prev_hole); +EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry @@ -1418,51 +1486,44 @@ EXPORT_SYMBOL(page_cache_prev_hole); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. 
+ */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@ -1483,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry); * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Otherwise, %NULL is returned. - * * find_lock_entry() may sleep. + * + * Return: the found page or shadow entry, %NULL if nothing is found. */ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { @@ -1493,7 +1554,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page_mapping(page) != mapping)) { @@ -1525,12 +1586,14 @@ EXPORT_SYMBOL(find_lock_entry); * - FGP_CREAT: If page is not present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased - * refcount. Otherwise, NULL is returned. + * refcount. 
* * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. * * If there is a page cache page, it is returned with an increased refcount. + * + * Return: the found page or %NULL otherwise. */ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t gfp_mask) @@ -1539,7 +1602,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; @@ -1563,7 +1626,7 @@ repeat: VM_BUG_ON_PAGE(page->index != offset, page); } - if (page && (fgp_flags & FGP_ACCESSED)) + if (fgp_flags & FGP_ACCESSED) mark_page_accessed(page); no_page: @@ -1618,60 +1681,54 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * - * find_get_entries() returns the number of pages and shadow entries - * which were found. + * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. 
Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1694,72 +1751,58 @@ export: * indexes. There may be holes in the indices due to not-present pages. * We also update @start to index the next page for the traversal. * - * find_get_pages_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * Return: the number of pages which were found. If this number is + * smaller than @nr_pages, the end of specified range has been * reached. 
*/ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = xas.xa_index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. 
*/ if (end == (pgoff_t)-1) @@ -1782,69 +1825,50 @@ out: * find_get_pages_contig() works exactly like find_get_pages(), except * that the returned number of pages are guaranteed to be contiguous. * - * find_get_pages_contig() returns the number of pages which were found. + * Return: the number of pages which were found. */ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } - - /* - * must check mapping and index after taking the ref. - * otherwise we can get both false positives and false - * negatives, which is just confusing to the caller. 
- */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { - put_page(page); - break; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1862,76 +1886,62 @@ EXPORT_SYMBOL(find_get_pages_contig); * * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. + * + * Return: the number of pages which were found. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. 
+ */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = xas.xa_index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; @@ -1955,59 +1965,55 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * * Like find_get_entries, except we only return entries which are tagged with * @tag. + * + * Return: the number of entries which were found. 
*/ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -2046,6 +2052,10 @@ static void shrink_readahead_size_eio(struct file *filp, * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
+ * + * Return: + * * total number of bytes copied, including those that were already @written + * * negative error code if nothing was copied */ static ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) @@ -2122,7 +2132,7 @@ find_page: !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; /* pipes can't handle partially uptodate pages */ - if (unlikely(iter->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(iter))) goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; @@ -2307,6 +2317,9 @@ out: * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * Return: + * * number of bytes copied, even for partial reads + * * negative error code if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -2374,6 +2387,8 @@ EXPORT_SYMBOL(generic_file_read_iter); * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. + * + * Return: %0 on success, negative error code otherwise. */ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { @@ -2488,6 +2503,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. + * + * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { @@ -2581,9 +2598,7 @@ no_cached_page: * system is low on memory, or a problem occurs while trying * to schedule I/O. 
*/ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); page_not_uptodate: /* @@ -2613,45 +2628,38 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); + + /* + * Check for a locked page first, as a speculative + * reference may adversely influence page migration. + */ + if (PageLocked(head)) + goto next; if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? 
*/ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@ -2670,10 +2678,10 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2686,8 +2694,6 @@ next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } @@ -2748,9 +2754,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } #else -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { - return -ENOSYS; + return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -2797,7 +2803,7 @@ repeat: put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } @@ -2884,6 +2890,8 @@ out: * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, @@ -2904,6 +2912,8 @@ EXPORT_SYMBOL(read_cache_page); * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. + * + * Return: up to date page on success, ERR_PTR() on failure. 
*/ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, @@ -2916,6 +2926,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page_gfp); /* + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. + */ +static int generic_access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + *count = min(*count, max_size - pos); + return 0; +} + +static int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + return generic_access_check_limits(file, pos, count); +} + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. 
@@ -2926,8 +2972,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - unsigned long limit = rlimit(RLIMIT_FSIZE); - loff_t pos; + loff_t count; + int ret; if (!iov_iter_count(from)) return 0; @@ -2936,43 +2982,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - pos = iocb->ki_pos; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - if (limit != RLIM_INFINITY) { - if (iocb->ki_pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - iov_iter_truncate(from, limit - (unsigned long)pos); - } + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. 
*/ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_access_check_limits(file_in, pos_in, &count); + if (ret) + return ret; + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; /* - * LFS rule + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. */ - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (pos >= MAX_NON_LFS) - return -EFBIG; - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; } + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + /* - * Are we about to exceed the fs block limit ? - * - * If we have written data it becomes a short write. If we have - * exceeded without writing data we send a signal and return EFBIG. - * Linus frestrict idea will clean these up nicely.. + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. 
*/ - if (unlikely(pos >= inode->i_sb->s_maxbytes)) - return -EFBIG; + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); - return iov_iter_count(from); + *req_count = count; + return 0; } -EXPORT_SYMBOL(generic_write_checks); int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -3012,7 +3114,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + iov_iter_count(from))) + pos + write_len - 1)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, @@ -3195,6 +3297,10 @@ EXPORT_SYMBOL(generic_perform_write); * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3279,6 +3385,10 @@ EXPORT_SYMBOL(__generic_file_write_iter); * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. + * Return: + * * negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -3305,8 +3415,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). 
If the release was successful, return '1'. - * Otherwise return zero. + * (presumably at page->private). * * This may also be called if PG_fscache is set on a page, indicating that the * page is known to the local caching routines. @@ -3314,6 +3423,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * + * Return: %1 if the release was successful, otherwise return zero. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { @@ -13,6 +13,9 @@ #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h> +#include <linux/sched/mm.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> @@ -20,6 +23,11 @@ #include "internal.h" +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -71,10 +79,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -116,8 +124,8 @@ retry: * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. 
*/ - pgmap = get_dev_pagemap(pte_pfn(pte), NULL); - if (pgmap) + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) page = pte_page(pte); else goto no_page; @@ -152,15 +160,8 @@ retry: goto retry; } - if (flags & FOLL_GET) { + if (flags & FOLL_GET) get_page(page); - - /* drop the pgmap reference now that we hold the page */ - if (pgmap) { - put_dev_pagemap(pgmap); - pgmap = NULL; - } - } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -210,7 +211,8 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -258,13 +260,13 @@ retry: } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); @@ -284,7 +286,7 @@ retry_locked: } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT) { int ret; @@ -307,18 +309,18 @@ retry_locked: } return ret ? 
ERR_PTR(ret) : - follow_page_pte(vma, address, pmd, flags); + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; + ctx->page_mask = HPAGE_PMD_NR - 1; return page; } - static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; @@ -344,7 +346,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; @@ -352,13 +354,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); - return follow_pmd_mask(vma, address, pud, flags, page_mask); + return follow_pmd_mask(vma, address, pud, flags, ctx); } - static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; @@ -378,7 +380,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - return follow_pud_mask(vma, address, p4d, flags, page_mask); + return follow_pud_mask(vma, address, p4d, flags, ctx); } /** @@ -386,23 +388,29 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page + * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a + * pointer to output page_mask * * @flags can have FOLL_ 
flags set, defined in <linux/mm.h> * - * Returns the mapped (struct page *), %NULL if no mapping exists, or + * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches + * the device's dev_pagemap metadata to avoid repeating expensive lookups. + * + * On output, the @ctx->page_mask is set according to the size of the page. + * + * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - unsigned int *page_mask) + struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; + ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, flags & FOLL_WRITE); @@ -431,7 +439,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return no_page_table(vma, flags); } - return follow_p4d_mask(vma, address, pgd, flags, page_mask); + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -659,9 +679,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i = 0; - unsigned int page_mask; + long ret = 0, i = 0; struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; @@ -685,18 +705,19 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!vma || start >= vma->vm_end) { vma 
= find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { - int ret; ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &pages[i] : NULL); if (ret) - return i ? : ret; - page_mask = 0; + goto out; + ctx.page_mask = 0; goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) - return i ? : -EFAULT; + if (!vma || check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -709,23 +730,26 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &page_mask); + + page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - int ret; ret = faultin_page(tsk, vma, start, &foll_flags, nonblocking); switch (ret) { case 0: goto retry; + case -EBUSY: + ret = 0; + /* FALLTHRU */ case -EFAULT: case -ENOMEM: case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; + goto out; case -ENOENT: goto next_page; } @@ -737,27 +761,31 @@ retry: */ goto next_page; } else if (IS_ERR(page)) { - return i ? i : PTR_ERR(page); + ret = PTR_ERR(page); + goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); - page_mask = 0; + ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; - page_mask = 0; + ctx.page_mask = 0; } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); - return i; +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? 
i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, @@ -1101,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) + #ifdef CONFIG_FS_DAX +static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + long i; + struct vm_area_struct *vma_prev = NULL; + + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + return true; + } + return false; +} +#else +static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) +{ + return false; +} +#endif + +#ifdef CONFIG_CMA +static struct page *new_non_cma_page(struct page *page, unsigned long private) +{ + /* + * We want to make sure we allocate the new page from the same node + * as the source page. + */ + int nid = page_to_nid(page); + /* + * Trying to allocate a page for migration. Ignore allocation + * failure warnings. We don't force __GFP_THISNODE here because + * this node here is the node where we have CMA reservation and + * in some case these nodes will have really less non movable + * allocation memory. + */ + gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(page)) { + struct hstate *h = page_hstate(page); + /* + * We don't want to dequeue from the pool because pool pages will + * mostly be from the CMA region. + */ + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + } +#endif + if (PageTransHuge(page)) { + struct page *thp; + /* + * ignore allocation failure warnings + */ + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; + + /* + * Remove the movable mask so that we don't allocate from + * CMA area again. 
+ */ + thp_gfpmask &= ~__GFP_MOVABLE; + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } + + return __alloc_pages_node(nid, gfp_mask, 0); +} + +static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + long i; + bool drain_allow = true; + bool migrate_allow = true; + LIST_HEAD(cma_page_list); + +check_again: + for (i = 0; i < nr_pages; i++) { + /* + * If we get a page from the CMA zone, since we are going to + * be pinning these entries, we might as well move them out + * of the CMA zone if possible. + */ + if (is_migrate_cma_page(pages[i])) { + + struct page *head = compound_head(pages[i]); + + if (PageHuge(head)) { + isolate_huge_page(head, &cma_page_list); + } else { + if (!PageLRU(head) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (!isolate_lru_page(head)) { + list_add_tail(&head->lru, &cma_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_cache(head), + hpage_nr_pages(head)); + } + } + } + } + + if (!list_empty(&cma_page_list)) { + /* + * drop the above get_user_pages reference. + */ + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + if (migrate_pages(&cma_page_list, new_non_cma_page, + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { + /* + * some of the pages failed migration. Do get_user_pages + * without migration. + */ + migrate_allow = false; + + if (!list_empty(&cma_page_list)) + putback_movable_pages(&cma_page_list); + } + /* + * We did migrate all the pages, Try to get the page references again + * migrating any new CMA pages which we failed to isolate earlier. 
+ */ + nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + if ((nr_pages > 0) && migrate_allow) { + drain_allow = true; + goto check_again; + } + } + + return nr_pages; +} +#else +static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, + unsigned int gup_flags, + struct page **pages, + struct vm_area_struct **vmas) +{ + return nr_pages; +} +#endif + /* * This is the same as get_user_pages() in that it assumes we are * operating on the current task's mm, but it goes further to validate @@ -1115,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages); * Contrast this to iov_iter_get_pages() usages which are transient. */ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) { struct vm_area_struct **vmas = vmas_arg; - struct vm_area_struct *vma_prev = NULL; + unsigned long flags; long rc, i; if (!pages) @@ -1132,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, return -ENOMEM; } + flags = memalloc_nocma_save(); rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; - for (i = 0; i < rc; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - break; - } - - /* - * Either get_user_pages() failed, or the vma validation - * succeeded, in either case we don't need to put_page() before - * returning. 
- */ - if (i >= rc) + if (check_dax_vmas(vmas, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; goto out; + } - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; + rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: if (vmas != vmas_arg) kfree(vmas); @@ -1649,7 +1826,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || + pmd_devmap(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they @@ -1761,7 +1939,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, * Check if it's allowed to use __get_user_pages_fast() for the range, or * we need to fall back to the slow version: */ -bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +bool gup_fast_permitted(unsigned long start, int nr_pages) { unsigned long len, end; @@ -1780,17 +1958,15 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - unsigned long addr, len, end; + unsigned long len, end; unsigned long flags; int nr = 0; start &= PAGE_MASK; - addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return 0; /* @@ -1798,16 +1974,16 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * interrupts disabled by get_futex_key. * * With interrupts disabled, we block page table pages from being - * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h - * for more details. + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. 
* * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, nr_pages, write)) { + if (gup_fast_permitted(start, nr_pages)) { local_irq_save(flags); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(start, end, write, pages, &nr); local_irq_restore(flags); } @@ -1844,11 +2020,10 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (nr_pages <= 0) return 0; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, nr_pages, write)) { + if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); gup_pgd_range(addr, end, write, pages, &nr); local_irq_enable(); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7405c9d89d65..6c0279e70cc4 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -6,13 +6,17 @@ #include <linux/debugfs.h> #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; __u32 flags; + __u64 expansion[10]; /* For future use */ }; static int __gup_benchmark_ioctl(unsigned int cmd, @@ -23,6 +27,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, int nr; struct page **pages; + if (gup->size > ULONG_MAX) + return -EINVAL; + nr_pages = gup->size / PAGE_SIZE; pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); if (!pages) @@ -41,21 +48,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + switch (cmd) { + case GUP_FAST_BENCHMARK: + nr = get_user_pages_fast(addr, nr, gup->flags & 1, + pages + i); + break; + case 
GUP_LONGTERM_BENCHMARK: + nr = get_user_pages_longterm(addr, nr, gup->flags & 1, + pages + i, NULL); + break; + case GUP_BENCHMARK: + nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, + NULL); + break; + default: + return -1; + } + if (nr <= 0) break; i += nr; } end_time = ktime_get(); - gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->get_delta_usec = ktime_us_delta(end_time, start_time); gup->size = addr - gup->addr; + start_time = ktime_get(); for (i = 0; i < nr_pages; i++) { if (!pages[i]) break; put_page(pages[i]); } + end_time = ktime_get(); + gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); return 0; @@ -67,8 +93,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, struct gup_benchmark gup; int ret; - if (cmd != GUP_FAST_BENCHMARK) + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_LONGTERM_BENCHMARK: + case GUP_BENCHMARK: + break; + default: return -EINVAL; + } if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; @@ -90,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = { static int gup_benchmark_init(void) { - void *ret; - - ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, - &gup_benchmark_fops); - if (!ret) - pr_warn("Failed to create gup_benchmark in debugfs"); + debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); return 0; } diff --git a/mm/highmem.c b/mm/highmem.c index 59db3223a5d6..107b10f9878e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) } #endif -unsigned long totalhigh_pages __read_mostly; -EXPORT_SYMBOL(totalhigh_pages); - +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * Authors: Jérôme Glisse <[email protected]> + * Authors: Jérôme Glisse <[email protected]> */ /* * Refer to include/linux/hmm.h for information about heterogeneous memory @@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * * @mm: mm struct this HMM struct is bound to * @lock: lock protecting ranges list - * @sequence: we track updates to the CPU page table with a sequence number * @ranges: list of range being snapshotted * @mirrors: list of mirrors for this mm * @mmu_notifier: mmu notifier to track updates to CPU page table @@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; struct hmm { struct mm_struct *mm; spinlock_t lock; - atomic_t sequence; struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; @@ -85,22 +83,11 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); - atomic_set(&hmm->sequence, 0); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->lock); hmm->mm = mm; - /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() - */ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { - kfree(hmm); - return NULL; - } - spin_lock(&mm->page_table_lock); if (!mm->hmm) mm->hmm = hmm; @@ -108,12 +95,27 @@ static struct hmm *hmm_register(struct mm_struct *mm) cleanup = true; spin_unlock(&mm->page_table_lock); - if (cleanup) { - mmu_notifier_unregister(&hmm->mmu_notifier, mm); - kfree(hmm); - } + if (cleanup) + goto error; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) + goto error_mm; return mm->hmm; + +error_mm: + spin_lock(&mm->page_table_lock); + 
if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); +error: + kfree(hmm); + return NULL; } void hmm_mm_destroy(struct mm_struct *mm) @@ -121,10 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm) kfree(mm->hmm); } -static void hmm_invalidate_range(struct hmm *hmm, - enum hmm_update_type action, - unsigned long start, - unsigned long end) +static int hmm_invalidate_range(struct hmm *hmm, bool device, + const struct hmm_update *update) { struct hmm_mirror *mirror; struct hmm_range *range; @@ -133,22 +133,33 @@ static void hmm_invalidate_range(struct hmm *hmm, list_for_each_entry(range, &hmm->ranges, list) { unsigned long addr, idx, npages; - if (end < range->start || start >= range->end) + if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(start, range->start); + addr = max(update->start, range->start); idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(&hmm->lock); + if (!device) + return 0; + down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) - mirror->ops->sync_cpu_device_pagetables(mirror, action, - start, end); + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); + if (!update->blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + return -EAGAIN; + } + } up_read(&hmm->mirrors_sem); + + return 0; } static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -178,30 +189,33 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - bool blockable) + const struct mmu_notifier_range *range) { - struct hmm *hmm = mm->hmm; + 
struct hmm_update update; + struct hmm *hmm = range->mm->hmm; VM_BUG_ON(!hmm); - atomic_inc(&hmm->sequence); - - return 0; + update.start = range->start; + update.end = range->end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = range->blockable; + return hmm_invalidate_range(hmm, true, &update); } static void hmm_invalidate_range_end(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) + const struct mmu_notifier_range *range) { - struct hmm *hmm = mm->hmm; + struct hmm_update update; + struct hmm *hmm = range->mm->hmm; VM_BUG_ON(!hmm); - hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); + update.start = range->start; + update.end = range->end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = true; + hmm_invalidate_range(hmm, false, &update); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -278,12 +292,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) if (!should_unregister || mm == NULL) return; + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&mm->page_table_lock); if (mm->hmm == hmm) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -571,22 +586,42 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; + pmd_t pmd; - i = (addr - range->start) >> PAGE_SHIFT; again: - if (pmd_none(*pmdp)) + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) + if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - pmd_t pmd; + if 
(thp_migration_supported() && is_pmd_migration_entry(pmd)) { + bool fault, write_fault; + unsigned long npages; + uint64_t *pfns; + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + if (fault || write_fault) { + hmm_vma_walk->last = addr; + pmd_migration_entry_wait(vma->vm_mm, pmdp); + return -EAGAIN; + } + return 0; + } else if (!pmd_present(pmd)) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* * No need to take pmd_lock here, even if some other threads * is splitting the huge pmd we will get that event through @@ -601,13 +636,21 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; + i = (addr - range->start) >> PAGE_SHIFT; return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } - if (pmd_bad(*pmdp)) + /* + * We have handled all the valid case above ie either none, migration, + * huge or transparent huge. At this point either it is a valid pmd + * entry pointing to pte directory or it is a bad pmd that will not + * recover. 
+ */ + if (pmd_bad(pmd)) return hmm_pfns_bad(start, end, walk); ptep = pte_offset_map(pmdp, addr); + i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { int r; @@ -938,19 +981,13 @@ static void hmm_devmem_ref_exit(void *data) struct hmm_devmem *devmem; devmem = container_of(ref, struct hmm_devmem, ref); + wait_for_completion(&devmem->completion); percpu_ref_exit(ref); - devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); } -static void hmm_devmem_ref_kill(void *data) +static void hmm_devmem_ref_kill(struct percpu_ref *ref) { - struct percpu_ref *ref = data; - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); percpu_ref_kill(ref); - wait_for_completion(&devmem->completion); - devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); } static int hmm_devmem_fault(struct vm_area_struct *vma, @@ -973,170 +1010,6 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } -static DEFINE_MUTEX(hmm_devmem_lock); -static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); - -static void hmm_devmem_radix_release(struct resource *resource) -{ - resource_size_t key; - - mutex_lock(&hmm_devmem_lock); - for (key = resource->start; - key <= resource->end; - key += PA_SECTION_SIZE) - radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); - mutex_unlock(&hmm_devmem_lock); -} - -static void hmm_devmem_release(struct device *dev, void *data) -{ - struct hmm_devmem *devmem = data; - struct resource *resource = devmem->resource; - unsigned long start_pfn, npages; - struct zone *zone; - struct page *page; - - if (percpu_ref_tryget_live(&devmem->ref)) { - dev_WARN(dev, "%s: page mapping is still live!\n", __func__); - percpu_ref_put(&devmem->ref); - } - - /* pages are dead and unused, undo the arch mapping */ - start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; - npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; - - page 
= pfn_to_page(start_pfn); - zone = page_zone(page); - - mem_hotplug_begin(); - if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) - __remove_pages(zone, start_pfn, npages, NULL); - else - arch_remove_memory(start_pfn << PAGE_SHIFT, - npages << PAGE_SHIFT, NULL); - mem_hotplug_done(); - - hmm_devmem_radix_release(resource); -} - -static int hmm_devmem_pages_create(struct hmm_devmem *devmem) -{ - resource_size_t key, align_start, align_size, align_end; - struct device *device = devmem->device; - int ret, nid, is_ram; - unsigned long pfn; - - align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(devmem->resource->start + - resource_size(devmem->resource), - PA_SECTION_SIZE) - align_start; - - is_ram = region_intersects(align_start, align_size, - IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE); - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "%s attempted on mixed region %pr\n", - __func__, devmem->resource); - return -ENXIO; - } - if (is_ram == REGION_INTERSECTS) - return -ENXIO; - - if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - else - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_fault = hmm_devmem_fault; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.dev = devmem->device; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - - mutex_lock(&hmm_devmem_lock); - align_end = align_start + align_size - 1; - for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { - struct hmm_devmem *dup; - - dup = radix_tree_lookup(&hmm_devmem_radix, - key >> PA_SECTION_SHIFT); - if (dup) { - dev_err(device, "%s: collides with mapping for %s\n", - __func__, dev_name(dup->device)); - mutex_unlock(&hmm_devmem_lock); - ret = -EBUSY; - goto error; - } - ret = radix_tree_insert(&hmm_devmem_radix, - key >> PA_SECTION_SHIFT, - devmem); - if (ret) { - dev_err(device, "%s: failed: %d\n", 
__func__, ret); - mutex_unlock(&hmm_devmem_lock); - goto error_radix; - } - } - mutex_unlock(&hmm_devmem_lock); - - nid = dev_to_node(device); - if (nid < 0) - nid = numa_mem_id(); - - mem_hotplug_begin(); - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For device public memory, which is accesible by the CPU, we do - * want the linear mapping and thus use arch_add_memory(). - */ - if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) - ret = arch_add_memory(nid, align_start, align_size, NULL, - false); - else - ret = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); - if (ret) { - mem_hotplug_done(); - goto error_add_memory; - } - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL); - mem_hotplug_done(); - - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); - - page->pgmap = &devmem->pagemap; - } - return 0; - -error_add_memory: - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); -error_radix: - hmm_devmem_radix_release(devmem->resource); -error: - return ret; -} - -static int hmm_devmem_match(struct device *dev, void *data, void *match_data) -{ - struct hmm_devmem *devmem = data; - - return devmem->resource == match_data; -} - -static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) -{ - devres_release(devmem->device, &hmm_devmem_release, - &hmm_devmem_match, devmem->resource); -} - /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1160,12 +1033,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, { struct hmm_devmem *devmem; resource_size_t addr; + void *result; int ret; dev_pagemap_get_ops(); - devmem = 
devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), - GFP_KERNEL, dev_to_node(device)); + devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); @@ -1179,11 +1052,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 0, GFP_KERNEL); if (ret) - goto error_percpu_ref; + return ERR_PTR(ret); - ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref); if (ret) - goto error_devm_add_action; + return ERR_PTR(ret); size = ALIGN(size, PA_SECTION_SIZE); addr = min((unsigned long)iomem_resource.end, @@ -1203,54 +1076,40 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->resource = devm_request_mem_region(device, addr, size, dev_name(device)); - if (!devmem->resource) { - ret = -ENOMEM; - goto error_no_resource; - } + if (!devmem->resource) + return ERR_PTR(-ENOMEM); break; } - if (!devmem->resource) { - ret = -ERANGE; - goto error_no_resource; - } + if (!devmem->resource) + return ERR_PTR(-ERANGE); devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); + devmem->page_fault = hmm_devmem_fault; - ret = hmm_devmem_pages_create(devmem); - if (ret) - goto error_pages; - - devres_add(device, devmem); - - ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); - if (ret) { - hmm_devmem_remove(devmem); - return ERR_PTR(ret); - } + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = *devmem->resource; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.altmap_valid = false; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + devmem->pagemap.kill = hmm_devmem_ref_kill; + result = devm_memremap_pages(devmem->device, &devmem->pagemap); + if 
(IS_ERR(result)) + return result; return devmem; - -error_pages: - devm_release_mem_region(device, devmem->resource->start, - resource_size(devmem->resource)); -error_no_resource: -error_devm_add_action: - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); -error_percpu_ref: - devres_free(devmem); - return ERR_PTR(ret); } -EXPORT_SYMBOL(hmm_devmem_add); +EXPORT_SYMBOL_GPL(hmm_devmem_add); struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, struct device *device, struct resource *res) { struct hmm_devmem *devmem; + void *result; int ret; if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) @@ -1258,8 +1117,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, dev_pagemap_get_ops(); - devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), - GFP_KERNEL, dev_to_node(device)); + devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); @@ -1273,71 +1131,32 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 0, GFP_KERNEL); if (ret) - goto error_percpu_ref; + return ERR_PTR(ret); - ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, + &devmem->ref); if (ret) - goto error_devm_add_action; - + return ERR_PTR(ret); devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); + devmem->page_fault = hmm_devmem_fault; - ret = hmm_devmem_pages_create(devmem); - if (ret) - goto error_devm_add_action; - - devres_add(device, devmem); - - ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); - if (ret) { - hmm_devmem_remove(devmem); - return ERR_PTR(ret); - } + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + devmem->pagemap.res = *devmem->resource; + devmem->pagemap.page_free = hmm_devmem_free; + 
devmem->pagemap.altmap_valid = false; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + devmem->pagemap.kill = hmm_devmem_ref_kill; + result = devm_memremap_pages(devmem->device, &devmem->pagemap); + if (IS_ERR(result)) + return result; return devmem; - -error_devm_add_action: - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); -error_percpu_ref: - devres_free(devmem); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(hmm_devmem_add_resource); - -/* - * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) - * - * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory - * - * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf - * of the device driver. It will free struct page and remove the resource that - * reserved the physical address range for this device memory. - */ -void hmm_devmem_remove(struct hmm_devmem *devmem) -{ - resource_size_t start, size; - struct device *device; - bool cdm = false; - - if (!devmem) - return; - - device = devmem->device; - start = devmem->resource->start; - size = resource_size(devmem->resource); - - cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; - hmm_devmem_ref_kill(&devmem->ref); - hmm_devmem_ref_exit(&devmem->ref); - hmm_devmem_pages_remove(devmem); - - if (!cdm) - devm_release_mem_region(device, start, size); } -EXPORT_SYMBOL(hmm_devmem_remove); +EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); /* * A device driver that wants to handle multiple devices memory through a diff --git a/mm/huge_memory.c b/mm/huge_memory.c index deed97fba979..404acdcd0455 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -62,6 +63,16 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +bool 
transparent_hugepage_enabled(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return __transparent_hugepage_enabled(vma); + if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) + return __transparent_hugepage_enabled(vma); + + return false; +} + static struct page *get_huge_zero_page(void) { struct page *zero_page; @@ -420,7 +431,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } @@ -558,7 +569,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -606,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); } return 0; @@ -633,16 +645,25 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? 
__GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT; } @@ -683,7 +704,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *zero_page; bool set; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -772,7 +793,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -852,11 +873,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -886,12 +906,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -910,7 +929,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!vma_is_anonymous(vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; @@ 
-1000,11 +1019,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1028,12 +1046,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1129,8 +1146,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, int i; vm_fault_t ret = 0; struct page **pages; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), GFP_KERNEL); @@ -1168,9 +1184,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) @@ -1215,8 +1231,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. 
*/ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); ret |= VM_FAULT_WRITE; put_page(page); @@ -1226,7 +1241,7 @@ out: out_free_pages: spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1243,8 +1258,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) struct page *page = NULL, *new_page; struct mem_cgroup *memcg; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; gfp_t huge_gfp; /* for allocation and charge */ vm_fault_t ret = 0; @@ -1288,7 +1302,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) get_page(page); spin_unlock(vmf->ptl); alloc: - if (transparent_hugepage_enabled(vma) && + if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -1325,6 +1339,7 @@ alloc: } count_vm_event(THP_FAULT_ALLOC); + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); @@ -1333,9 +1348,9 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); if (page) @@ -1370,8 +1385,7 @@ out_mn: * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. 
*/ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); out: return ret; out_unlock: @@ -1464,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = -1, this_nid = numa_node_id(); + int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1485,8 +1499,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -1510,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; @@ -1518,12 +1531,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - page_nid = -1; + page_nid = NUMA_NO_NODE; if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -1540,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto out_unlock; } /* Bail if we fail to protect against THP splits for any reason */ if (unlikely(!anon_vma)) { put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto clear_pmdnuma; } @@ -1562,8 +1574,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault 
*vmf, pmd_t pmd) * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. + */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked @@ -1597,7 +1621,7 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); @@ -1958,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { - pud_t orig_pud; spinlock_t *ptl; ptl = __pud_trans_huge_lock(pud, vma); @@ -1970,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pudp related * operations. 
*/ - orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, - tlb->fullmm); + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_dax(vma)) { spin_unlock(ptl); @@ -2000,14 +2022,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PUD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); - ptl = pud_lock(mm, pud); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) goto out; - __split_huge_pud_locked(vma, pud, haddr); + __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); @@ -2015,8 +2038,7 @@ out: * No need to double call mmu_notifier->invalidate_range() callback as * the above pudp_huge_clear_flush_notify() did already call it. 
*/ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2127,23 +2149,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ old_pmd = pmdp_invalidate(vma, haddr, pmd); -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION pmd_migration = is_pmd_migration_entry(old_pmd); - if (pmd_migration) { + if (unlikely(pmd_migration)) { swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); - } else -#endif + write = is_write_migration_entry(entry); + young = false; + soft_dirty = pmd_swp_soft_dirty(old_pmd); + } else { page = pmd_page(old_pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); + } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - if (pmd_dirty(old_pmd)) - SetPageDirty(page); - write = pmd_write(old_pmd); - young = pmd_young(old_pmd); - soft_dirty = pmd_soft_dirty(old_pmd); /* * Withdraw the table only after we mark the pmd entry invalid. 
@@ -2216,11 +2240,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); - ptl = pmd_lock(mm, pmd); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); /* * If caller asks to setup a migration entries, we need a page to check @@ -2236,7 +2261,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, freeze); + __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); /* @@ -2252,8 +2277,7 @@ out: * any further changes to individual pte will notify. 
So no need * to call mmu_notifier->invalidate_range() */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, @@ -2322,7 +2346,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void freeze_page(struct page *page) +static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; @@ -2337,7 +2361,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!unmap_success, page); } -static void unfreeze_page(struct page *page) +static void remap_page(struct page *page) { int i; if (PageTransHuge(page)) { @@ -2369,10 +2393,17 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + /* Page flags must be visible before we make the page non-compound. 
*/ smp_wmb(); @@ -2393,12 +2424,6 @@ static void __split_huge_page_tail(struct page *head, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); - page_tail->mapping = head->mapping; - - page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); /* @@ -2410,22 +2435,18 @@ static void __split_huge_page_tail(struct page *head, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - unsigned long flags) + pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); - struct zone *zone = page_zone(head); + pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; - pgoff_t end = -1; int i; - lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(head, pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - if (!PageAnon(page)) - end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2441,20 +2462,20 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ + /* Additional pin to swap cache */ if (PageSwapCache(head)) page_ref_add(head, 2); else page_ref_inc(head); } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); - unfreeze_page(head); + remap_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -2559,7 
+2580,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; else @@ -2597,6 +2618,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; + pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -2619,6 +2641,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } + end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2632,10 +2655,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) anon_vma = NULL; i_mmap_lock_read(mapping); + + /* + *__split_huge_page() may need to trim off pages beyond EOF: + * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, + * which cannot be nested inside the page tree lock. So note + * end now: i_size itself may be changed at any moment, but + * head page lock is good enough to serialize the trimming. 
+ */ + end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); } /* - * Racy check if we can split the page, before freeze_page() will + * Racy check if we can split the page, before unmap_page() will * split PMDs */ if (!can_split_huge_page(head, &extra_pins)) { @@ -2644,7 +2676,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(head); + unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -2652,20 +2684,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); + spin_lock_irqsave(&pgdata->lru_lock, flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. 
*/ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } @@ -2681,7 +2710,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); - __split_huge_page(page, list, flags); + __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2700,8 +2729,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); + remap_page(head); ret = -EBUSY; } @@ -2858,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, static int __init split_huge_pages_debugfs(void) { - void *ret; - - ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, - &split_huge_pages_fops); - if (!ret) - pr_warn("Failed to create split_huge_pages in debugfs"); + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, + &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5c390f5a5207..97b1e0290c66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,7 +15,7 @@ #include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/mmdebug.h> @@ -25,6 +25,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> +#include <linux/numa.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, struct zonelist *zonelist; struct zone *zone; 
struct zoneref *z; - int node = -1; + int node = NUMA_NO_NODE; zonelist = node_zonelist(nid, gfp_mask); @@ -919,7 +920,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_migration_supported(h)) + if (hugepage_movable_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1248,10 +1249,11 @@ void free_huge_page(struct page *page) (struct hugepage_subpool *)page_private(page); bool restore_reserve; - set_page_private(page, 0); - page->mapping = NULL; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); + + set_page_private(page, 0); + page->mapping = NULL; restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1585,8 +1587,8 @@ out_unlock: return page; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; @@ -2100,9 +2102,9 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_raw( + addr = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), - 0, BOOTMEM_ALLOC_ACCESSIBLE, node); + 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -3233,22 +3235,22 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { - pte_t *src_pte, *dst_pte, entry; + pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; int ret = 0; 
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - mmun_start = vma->vm_start; - mmun_end = vma->vm_end; - if (cow) - mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + if (cow) { + mmu_notifier_range_init(&range, src, vma->vm_start, + vma->vm_end); + mmu_notifier_invalidate_range_start(&range); + } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; @@ -3261,15 +3263,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* If the pagetables are shared don't copy or take references */ - if (dst_pte == src_pte) + /* + * If the pagetables are shared don't copy or take references. + * dst_pte == src_pte is the common case of src/dest sharing. + * + * However, src could have 'unshared' and dst shares with + * another vma. If dst_pte !none, this implies sharing. + * Check here before taking page table lock, and once again + * after taking the lock below. + */ + dst_entry = huge_ptep_get(dst_pte); + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) continue; dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - if (huge_pte_none(entry)) { /* skip none entry */ + dst_entry = huge_ptep_get(dst_pte); + if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + /* + * Skip if src entry none. Also, skip in the + * unlikely case dst entry !none as this implies + * sharing with another vma. 
+ */ ; } else if (unlikely(is_hugetlb_entry_migration(entry) || is_hugetlb_entry_hwpoisoned(entry))) { @@ -3309,7 +3326,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } if (cow) - mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -3326,8 +3343,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - unsigned long mmun_start = start; /* For mmu_notifiers */ - unsigned long mmun_end = end; /* For mmu_notifiers */ + struct mmu_notifier_range range; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3343,8 +3359,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. */ - adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); address = start; for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address, sz); @@ -3412,7 +3429,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, if (ref_page) break; } - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); } @@ -3530,9 +3547,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int outside_reserve = 0; vm_fault_t ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ unsigned long haddr = address & huge_page_mask(h); + struct mmu_notifier_range range; pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ 
-3609,11 +3625,9 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); - set_page_huge_active(new_page); - mmun_start = haddr; - mmun_end = mmun_start + huge_page_size(h); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates @@ -3626,16 +3640,17 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); - mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range(mm, range.start, range.end); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_page_huge_active(new_page); /* Make the old page be freed below */ new_page = old_page; } spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out_release_all: restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); @@ -3690,6 +3705,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return err; ClearPagePrivate(page); + /* + * set page dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. 
+ */ + set_page_dirty(page); + spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); @@ -3709,6 +3730,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); + bool new_page = false; /* * Currently, we are forced to kill the process in the event the @@ -3770,7 +3792,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); - set_page_huge_active(page); + new_page = true; if (vma->vm_flags & VM_MAYSHARE) { int err = huge_add_to_page_cache(page, mapping, idx); @@ -3841,6 +3863,15 @@ retry: } spin_unlock(ptl); + + /* + * Only make newly allocated pages active. Existing pages found + * in the pagecache could be !page_huge_active() if they have been + * isolated for migration. + */ + if (new_page) + set_page_huge_active(page); + unlock_page(page); out: return ret; @@ -4059,7 +4090,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { - ret = -EFAULT; + ret = -ENOENT; *pagep = page; /* don't free the page */ goto out; @@ -4075,7 +4106,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * the set_pte_at() write. */ __SetPageUptodate(page); - set_page_huge_active(page); mapping = dst_vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -4143,6 +4173,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + set_page_huge_active(page); if (vm_shared) unlock_page(page); ret = 0; @@ -4178,7 +4209,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. 
*/ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { remainder = 0; break; } @@ -4248,7 +4279,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && + !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; *nr_pages = 0; /* @@ -4318,21 +4350,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; - unsigned long f_start = start; - unsigned long f_end = end; bool shared_pmd = false; + struct mmu_notifier_range range; /* * In the case of shared PMDs, the area to flush could be beyond - * start/end. Set f_start/f_end to cover the maximum possible + * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); + mmu_notifier_range_init(&range, mm, start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); - flush_cache_range(vma, f_start, f_end); + flush_cache_range(vma, range.start, range.end); - mmu_notifier_invalidate_range_start(mm, f_start, f_end); + mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4367,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } if (!huge_pte_none(pte)) { - pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte_t old_pte; + + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); - set_huge_pte_at(mm, address, ptep, pte); + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } spin_unlock(ptl); @@ -4383,7 +4417,7 @@ unsigned 
long hugetlb_change_protection(struct vm_area_struct *vma, * did unshare a page of pmds, flush the range corresponding to the pud. */ if (shared_pmd) - flush_hugetlb_tlb_range(vma, f_start, f_end); + flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, start, end); /* @@ -4393,7 +4427,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, f_start, f_end); + mmu_notifier_invalidate_range_end(&range); return pages << h->order; } diff --git a/mm/internal.h b/mm/internal.h index 87256ae1bef8..9eeaf2b95166 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -161,8 +161,9 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned long pfn, +extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); @@ -183,14 +184,16 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + unsigned int nr_freepages; /* Number of isolated free pages */ + unsigned int nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned 
long total_migrate_scanned; unsigned long total_free_scanned; - unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ - unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + unsigned short fast_search_fail;/* failures to use free list searches */ + short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ @@ -203,7 +206,16 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ - bool finishing_block; /* Finishing current pageblock */ + bool rescan; /* Rescanning the same pageblock */ +}; + +/* + * Used in direct compaction when a page should be taken from the freelists + * immediately when one is created during the free path. 
+ */ +struct capture_control { + struct compact_control *cc; + struct page *page; }; unsigned long @@ -444,6 +456,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 +#ifdef CONFIG_NUMA +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +#else +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) +{ + return NODE_RECLAIM_NOSCAN; +} +#endif + extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; @@ -480,10 +502,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#ifdef CONFIG_ZONE_DMA32 +#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ +#else +#define ALLOC_NOFRAGMENT 0x0 +#endif +#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 3289db38bc87..5d1065efbd47 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,11 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n -UBSAN_SANITIZE_kasan.o := n +UBSAN_SANITIZE_common.o := n +UBSAN_SANITIZE_generic.o := n +UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_kasan.o = -pg +CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_generic.o = -pg +CFLAGS_REMOVE_tags.o = -pg + # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 
-CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o quarantine.o +CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-$(CONFIG_KASAN) := common.o init.o report.o +obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o +obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c index c3bd5209da38..80bbe62b16cd 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/common.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * This file contains shadow memory manipulation code. + * This file contains common generic and tag-based KASAN code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <[email protected]> @@ -13,8 +14,7 @@ * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING +#define __KASAN_INTERNAL #include <linux/export.h> #include <linux/interrupt.h> @@ -40,6 +40,53 @@ #include "kasan.h" #include "../slab.h" +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. 
*/ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = save_stack(flags); +} + void kasan_enable_current(void) { current->kasan_depth++; @@ -50,27 +97,85 @@ void kasan_disable_current(void) current->kasan_depth--; } +void kasan_check_read(const volatile void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const volatile void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + check_memory_region((unsigned long)addr, len, true, _RET_IP_); + + return __memset(addr, c, len); +} + +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); + + return __memmove(dest, src, len); +} + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); + + return __memcpy(dest, src, len); +} + /* * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
*/ -static void kasan_poison_shadow(const void *address, size_t size, u8 value) +void kasan_poison_shadow(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_poison_object_data) pass tagged + * addresses to this function. + */ + address = reset_tag(address); + shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); - memset(shadow_start, value, shadow_end - shadow_start); + __memset(shadow_start, value, shadow_end - shadow_start); } void kasan_unpoison_shadow(const void *address, size_t size) { - kasan_poison_shadow(address, size, 0); + u8 tag = get_tag(address); + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_unpoison_object_data) pass tagged + * addresses to this function. + */ + address = reset_tag(address); + + kasan_poison_shadow(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); - *shadow = size & KASAN_SHADOW_MASK; + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + *shadow = tag; + else + *shadow = size & KASAN_SHADOW_MASK; } } @@ -116,199 +221,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark) kasan_unpoison_shadow(sp, size); } -/* - * All functions below always inlined so compiler could - * perform better optimizations in each of __asan_loadX/__assn_storeX - * depending on memory access size X. 
- */ - -static __always_inline bool memory_is_poisoned_1(unsigned long addr) -{ - s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; - return unlikely(last_accessible_byte >= shadow_value); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, - unsigned long size) -{ - u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); - - /* - * Access crosses 8(shadow size)-byte boundary. Such access maps - * into 2 shadow bytes, so we need to check them both. - */ - if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) - return *shadow_addr || memory_is_poisoned_1(addr + size - 1); - - return memory_is_poisoned_1(addr + size - 1); -} - -static __always_inline bool memory_is_poisoned_16(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - /* Unaligned 16-bytes access maps into 3 shadow bytes. */ - if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return *shadow_addr || memory_is_poisoned_1(addr + 15); - - return *shadow_addr; -} - -static __always_inline unsigned long bytes_is_nonzero(const u8 *start, - size_t size) -{ - while (size) { - if (unlikely(*start)) - return (unsigned long)start; - start++; - size--; - } - - return 0; -} - -static __always_inline unsigned long memory_is_nonzero(const void *start, - const void *end) -{ - unsigned int words; - unsigned long ret; - unsigned int prefix = (unsigned long)start % 8; - - if (end - start <= 16) - return bytes_is_nonzero(start, end - start); - - if (prefix) { - prefix = 8 - prefix; - ret = bytes_is_nonzero(start, prefix); - if (unlikely(ret)) - return ret; - start += prefix; - } - - words = (end - start) / 8; - while (words) { - if (unlikely(*(u64 *)start)) - return bytes_is_nonzero(start, 8); - start += 8; - words--; - } - - return bytes_is_nonzero(start, (end - start) % 8); -} - -static __always_inline bool 
memory_is_poisoned_n(unsigned long addr, - size_t size) -{ - unsigned long ret; - - ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), - kasan_mem_to_shadow((void *)addr + size - 1) + 1); - - if (unlikely(ret)) { - unsigned long last_byte = addr + size - 1; - s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); - - if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) - return true; - } - return false; -} - -static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) -{ - if (__builtin_constant_p(size)) { - switch (size) { - case 1: - return memory_is_poisoned_1(addr); - case 2: - case 4: - case 8: - return memory_is_poisoned_2_4_8(addr, size); - case 16: - return memory_is_poisoned_16(addr); - default: - BUILD_BUG(); - } - } - - return memory_is_poisoned_n(addr, size); -} - -static __always_inline void check_memory_region_inline(unsigned long addr, - size_t size, bool write, - unsigned long ret_ip) +void kasan_alloc_pages(struct page *page, unsigned int order) { - if (unlikely(size == 0)) - return; - - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, ret_ip); - return; - } + u8 tag; + unsigned long i; - if (likely(!memory_is_poisoned(addr, size))) + if (unlikely(PageHighMem(page))) return; - kasan_report(addr, size, write, ret_ip); -} - -static void check_memory_region(unsigned long addr, - size_t size, bool write, - unsigned long ret_ip) -{ - check_memory_region_inline(addr, size, write, ret_ip); -} - -void kasan_check_read(const volatile void *p, unsigned int size) -{ - check_memory_region((unsigned long)p, size, false, _RET_IP_); -} -EXPORT_SYMBOL(kasan_check_read); - -void kasan_check_write(const volatile void *p, unsigned int size) -{ - check_memory_region((unsigned long)p, size, true, _RET_IP_); -} -EXPORT_SYMBOL(kasan_check_write); - -#undef memset -void *memset(void *addr, int c, size_t len) -{ - 
check_memory_region((unsigned long)addr, len, true, _RET_IP_); - - return __memset(addr, c, len); -} - -#undef memmove -void *memmove(void *dest, const void *src, size_t len) -{ - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); - - return __memmove(dest, src, len); -} - -#undef memcpy -void *memcpy(void *dest, const void *src, size_t len) -{ - check_memory_region((unsigned long)src, len, false, _RET_IP_); - check_memory_region((unsigned long)dest, len, true, _RET_IP_); - - return __memcpy(dest, src, len); -} - -void kasan_alloc_pages(struct page *page, unsigned int order) -{ - if (likely(!PageHighMem(page))) - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + tag = random_tag(); + for (i = 0; i < (1 << order); i++) + page_kasan_tag_set(page + i, tag); + kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) @@ -323,8 +247,11 @@ void kasan_free_pages(struct page *page, unsigned int order) * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. */ -static unsigned int optimal_redzone(unsigned int object_size) +static inline unsigned int optimal_redzone(unsigned int object_size) { + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return 0; + return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : @@ -339,6 +266,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { unsigned int orig_size = *size; + unsigned int redzone_size; int redzone_adjust; /* Add alloc meta. */ @@ -346,20 +274,20 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. 
*/ - if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { + if (IS_ENABLED(CONFIG_KASAN_GENERIC) && + (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta))) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); } - redzone_adjust = optimal_redzone(cache->object_size) - - (*size - cache->object_size); + redzone_size = optimal_redzone(cache->object_size); + redzone_adjust = redzone_size - (*size - cache->object_size); if (redzone_adjust > 0) *size += redzone_adjust; *size = min_t(unsigned int, KMALLOC_MAX_SIZE, - max(*size, cache->object_size + - optimal_redzone(cache->object_size))); + max(*size, cache->object_size + redzone_size)); /* * If the metadata doesn't fit, don't enable KASAN at all. @@ -375,27 +303,34 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *flags |= SLAB_KASAN; } -void kasan_cache_shrink(struct kmem_cache *cache) +size_t kasan_metadata_size(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + (cache->kasan_info.free_meta_offset ? + sizeof(struct kasan_free_meta) : 0); } -void kasan_cache_shutdown(struct kmem_cache *cache) +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) { - if (!__kmem_cache_empty(cache)) - quarantine_remove_cache(cache); + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); + return (void *)object + cache->kasan_info.alloc_meta_offset; } -size_t kasan_metadata_size(struct kmem_cache *cache) +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) { - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? 
- sizeof(struct kasan_free_meta) : 0); + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + return (void *)object + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) { + unsigned long i; + + for (i = 0; i < (1 << compound_order(page)); i++) + page_kasan_tag_reset(page + i); kasan_poison_shadow(page_address(page), PAGE_SIZE << compound_order(page), KASAN_KMALLOC_REDZONE); @@ -413,92 +348,94 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) KASAN_KMALLOC_REDZONE); } -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -static inline void filter_irq_stacks(struct stack_trace *trace) -{ - int i; - - if (!trace->nr_entries) - return; - for (i = 0; i < trace->nr_entries; i++) - if (in_irqentry_text(trace->entries[i])) { - /* Include the irqentry function into the stack. */ - trace->nr_entries = i + 1; - break; - } -} - -static inline depot_stack_handle_t save_stack(gfp_t flags) -{ - unsigned long entries[KASAN_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = KASAN_STACK_DEPTH, - .skip = 0 - }; - - save_stack_trace(&trace); - filter_irq_stacks(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; - - return depot_save_stack(&trace, flags); -} - -static inline void set_track(struct kasan_track *track, gfp_t flags) -{ - track->pid = current->pid; - track->stack = save_stack(flags); -} - -struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, - const void *object) +/* + * This function assigns a tag to an object considering the following: + * 1. A cache might have a constructor, which might save a pointer to a slab + * object somewhere (e.g. in the object itself). 
We preassign a tag for + * each object in caches with constructors during slab creation and reuse + * the same tag each time a particular object is allocated. + * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be + * accessed after being freed. We preassign tags for objects in these + * caches as well. + * 3. For SLAB allocator we can't preassign tags randomly since the freelist + * is stored as an array of indexes instead of a linked list. Assign tags + * based on objects indexes, so that objects that are next to each other + * get different tags. + */ +static u8 assign_tag(struct kmem_cache *cache, const void *object, + bool init, bool keep_tag) { - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); - return (void *)object + cache->kasan_info.alloc_meta_offset; -} + /* + * 1. When an object is kmalloc()'ed, two hooks are called: + * kasan_slab_alloc() and kasan_kmalloc(). We assign the + * tag only in the first one. + * 2. We reuse the same tag for krealloc'ed objects. + */ + if (keep_tag) + return get_tag(object); -struct kasan_free_meta *get_free_info(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)object + cache->kasan_info.free_meta_offset; + /* + * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU + * set, assign a tag when the object is being allocated (init == false). + */ + if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) + return init ? KASAN_TAG_KERNEL : random_tag(); + + /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ +#ifdef CONFIG_SLAB + /* For SLAB assign tags based on the object index in the freelist. */ + return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); +#else + /* + * For SLUB assign a random tag during slab creation, otherwise reuse + * the already assigned tag. + */ + return init ? 
random_tag() : get_tag(object); +#endif } -void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) +void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, + const void *object) { struct kasan_alloc_meta *alloc_info; if (!(cache->flags & SLAB_KASAN)) - return; + return (void *)object; alloc_info = get_alloc_info(cache, object); __memset(alloc_info, 0, sizeof(*alloc_info)); + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + object = set_tag(object, + assign_tag(cache, object, true, false)); + + return (void *)object; } -void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) +static inline bool shadow_invalid(u8 tag, s8 shadow_byte) { - kasan_kmalloc(cache, object, cache->object_size, flags); + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + return shadow_byte < 0 || + shadow_byte >= KASAN_SHADOW_SCALE_SIZE; + else + return tag != (u8)shadow_byte; } static bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine) { s8 shadow_byte; + u8 tag; + void *tagged_object; unsigned long rounded_up_size; + tag = get_tag(object); + tagged_object = object; + object = reset_tag(object); + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { - kasan_report_invalid_free(object, ip); + kasan_report_invalid_free(tagged_object, ip); return true; } @@ -507,20 +444,22 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); - if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_invalid_free(object, ip); + if (shadow_invalid(tag, shadow_byte)) { + kasan_report_invalid_free(tagged_object, ip); return true; } rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); - if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN))) + if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || + 
unlikely(!(cache->flags & SLAB_KASAN))) return false; set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); quarantine_put(get_free_info(cache, object), cache); - return true; + + return IS_ENABLED(CONFIG_KASAN_GENERIC); } bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) @@ -528,33 +467,53 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return __kasan_slab_free(cache, object, ip, true); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, - gfp_t flags) +static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; unsigned long redzone_end; + u8 tag; if (gfpflags_allow_blocking(flags)) quarantine_reduce(); if (unlikely(object == NULL)) - return; + return NULL; redzone_start = round_up((unsigned long)(object + size), KASAN_SHADOW_SCALE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(object, size); + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + tag = assign_tag(cache, object, false, keep_tag); + + /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ + kasan_unpoison_shadow(set_tag(object, tag), size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) set_track(&get_alloc_info(cache, object)->alloc_track, flags); + + return set_tag(object, tag); +} + +void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, + gfp_t flags) +{ + return __kasan_kmalloc(cache, object, cache->object_size, flags, false); +} + +void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) +{ + return __kasan_kmalloc(cache, object, size, flags, true); } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +void * __must_check 
kasan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -564,7 +523,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) quarantine_reduce(); if (unlikely(ptr == NULL)) - return; + return NULL; page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), @@ -574,21 +533,24 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) kasan_unpoison_shadow(ptr, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); + + return (void *)ptr; } -void kasan_krealloc(const void *object, size_t size, gfp_t flags) +void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; if (unlikely(object == ZERO_SIZE_PTR)) - return; + return (void *)object; page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size, flags); + return kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size, flags); + return __kasan_kmalloc(page->slab_cache, object, size, + flags, true); } void kasan_poison_kfree(void *ptr, unsigned long ip) @@ -632,11 +594,12 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_ZERO, + GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { + __memset(ret, KASAN_SHADOW_INIT, shadow_size); find_vm_area(addr)->flags |= VM_KASAN; kmemleak_ignore(ret); return 0; @@ -651,147 +614,6 @@ void kasan_free_shadow(const struct vm_struct *vm) vfree(kasan_mem_to_shadow(vm->addr)); } -static void register_global(struct kasan_global *global) -{ - size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - - kasan_unpoison_shadow(global->beg, global->size); - - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - 
KASAN_GLOBAL_REDZONE); -} - -void __asan_register_globals(struct kasan_global *globals, size_t size) -{ - int i; - - for (i = 0; i < size; i++) - register_global(&globals[i]); -} -EXPORT_SYMBOL(__asan_register_globals); - -void __asan_unregister_globals(struct kasan_global *globals, size_t size) -{ -} -EXPORT_SYMBOL(__asan_unregister_globals); - -#define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ - { \ - check_memory_region_inline(addr, size, false, _RET_IP_);\ - } \ - EXPORT_SYMBOL(__asan_load##size); \ - __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ - { \ - check_memory_region_inline(addr, size, true, _RET_IP_); \ - } \ - EXPORT_SYMBOL(__asan_store##size); \ - __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_store##size##_noabort) - -DEFINE_ASAN_LOAD_STORE(1); -DEFINE_ASAN_LOAD_STORE(2); -DEFINE_ASAN_LOAD_STORE(4); -DEFINE_ASAN_LOAD_STORE(8); -DEFINE_ASAN_LOAD_STORE(16); - -void __asan_loadN(unsigned long addr, size_t size) -{ - check_memory_region(addr, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__asan_loadN); - -__alias(__asan_loadN) -void __asan_loadN_noabort(unsigned long, size_t); -EXPORT_SYMBOL(__asan_loadN_noabort); - -void __asan_storeN(unsigned long addr, size_t size) -{ - check_memory_region(addr, size, true, _RET_IP_); -} -EXPORT_SYMBOL(__asan_storeN); - -__alias(__asan_storeN) -void __asan_storeN_noabort(unsigned long, size_t); -EXPORT_SYMBOL(__asan_storeN_noabort); - -/* to shut up compiler complaints */ -void __asan_handle_no_return(void) {} -EXPORT_SYMBOL(__asan_handle_no_return); - -/* Emitted by compiler to poison large objects when they go out of scope. 
*/ -void __asan_poison_stack_memory(const void *addr, size_t size) -{ - /* - * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded - * by redzones, so we simply round up size to simplify logic. - */ - kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), - KASAN_USE_AFTER_SCOPE); -} -EXPORT_SYMBOL(__asan_poison_stack_memory); - -/* Emitted by compiler to unpoison large objects when they go into scope. */ -void __asan_unpoison_stack_memory(const void *addr, size_t size) -{ - kasan_unpoison_shadow(addr, size); -} -EXPORT_SYMBOL(__asan_unpoison_stack_memory); - -/* Emitted by compiler to poison alloca()ed objects. */ -void __asan_alloca_poison(unsigned long addr, size_t size) -{ - size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - - rounded_up_size; - size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); - - const void *left_redzone = (const void *)(addr - - KASAN_ALLOCA_REDZONE_SIZE); - const void *right_redzone = (const void *)(addr + rounded_up_size); - - WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); -} -EXPORT_SYMBOL(__asan_alloca_poison); - -/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) -{ - if (unlikely(!stack_top || stack_top > stack_bottom)) - return; - - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); -} -EXPORT_SYMBOL(__asan_allocas_unpoison); - -/* Emitted by the compiler to [un]poison local variables. 
*/ -#define DEFINE_ASAN_SET_SHADOW(byte) \ - void __asan_set_shadow_##byte(const void *addr, size_t size) \ - { \ - __memset((void *)addr, 0x##byte, size); \ - } \ - EXPORT_SYMBOL(__asan_set_shadow_##byte) - -DEFINE_ASAN_SET_SHADOW(00); -DEFINE_ASAN_SET_SHADOW(f1); -DEFINE_ASAN_SET_SHADOW(f2); -DEFINE_ASAN_SET_SHADOW(f3); -DEFINE_ASAN_SET_SHADOW(f5); -DEFINE_ASAN_SET_SHADOW(f8); - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c new file mode 100644 index 000000000000..504c79363a34 --- /dev/null +++ b/mm/kasan/generic.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core generic KASAN code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <[email protected]> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/linkage.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/bug.h> + +#include "kasan.h" +#include "../slab.h" + +/* + * All functions below always inlined so compiler could + * perform better optimizations in each of __asan_loadX/__assn_storeX + * depending on memory access size X. 
+ */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) +{ + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. + */ + if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); + + return memory_is_poisoned_1(addr + size - 1); +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + /* Unaligned 16-bytes access maps into 3 shadow bytes. */ + if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); + + return *shadow_addr; +} + +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_nonzero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_nonzero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_nonzero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_nonzero(start, 8); + start += 8; + words--; + } + + return bytes_is_nonzero(start, (end - start) % 8); +} + +static __always_inline bool 
memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch (size) { + case 1: + return memory_is_poisoned_1(addr); + case 2: + case 4: + case 8: + return memory_is_poisoned_2_4_8(addr, size); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + +static __always_inline void check_memory_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + if (unlikely(size == 0)) + return; + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + kasan_report(addr, size, write, ret_ip); + return; + } + + if (likely(!memory_is_poisoned(addr, size))) + return; + + kasan_report(addr, size, write, ret_ip); +} + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + check_memory_region_inline(addr, size, write, ret_ip); +} + +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void kasan_cache_shutdown(struct kmem_cache *cache) +{ + if (!__kmem_cache_empty(cache)) + quarantine_remove_cache(cache); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + 
+void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, false, _RET_IP_);\ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); + +/* Emitted by compiler to poison alloca()ed objects. 
*/ +void __asan_alloca_poison(unsigned long addr, size_t size) +{ + size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - + rounded_up_size; + size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); + + const void *left_redzone = (const void *)(addr - + KASAN_ALLOCA_REDZONE_SIZE); + const void *right_redzone = (const void *)(addr + rounded_up_size); + + WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + + kasan_unpoison_shadow((const void *)(addr + rounded_down_size), + size - rounded_down_size); + kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + kasan_poison_shadow(right_redzone, + padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); +} +EXPORT_SYMBOL(__asan_alloca_poison); + +/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +{ + if (unlikely(!stack_top || stack_top > stack_bottom)) + return; + + kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); +} +EXPORT_SYMBOL(__asan_allocas_unpoison); + +/* Emitted by the compiler to [un]poison local variables. */ +#define DEFINE_ASAN_SET_SHADOW(byte) \ + void __asan_set_shadow_##byte(const void *addr, size_t size) \ + { \ + __memset((void *)addr, 0x##byte, size); \ + } \ + EXPORT_SYMBOL(__asan_set_shadow_##byte) + +DEFINE_ASAN_SET_SHADOW(00); +DEFINE_ASAN_SET_SHADOW(f1); +DEFINE_ASAN_SET_SHADOW(f2); +DEFINE_ASAN_SET_SHADOW(f3); +DEFINE_ASAN_SET_SHADOW(f5); +DEFINE_ASAN_SET_SHADOW(f8); diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c new file mode 100644 index 000000000000..36c645939bc9 --- /dev/null +++ b/mm/kasan/generic_report.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains generic KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin <[email protected]> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> +#include <linux/module.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +void *find_first_bad_addr(void *addr, size_t size) +{ + void *p = addr; + + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) + p += KASAN_SHADOW_SCALE_SIZE; + return p; +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + u8 *shadow_addr; + + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. 
+ */ + bug_type = "out-of-bounds"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; + case KASAN_GLOBAL_REDZONE: + bug_type = "global-out-of-bounds"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; + break; + case KASAN_ALLOCA_LEFT: + case KASAN_ALLOCA_RIGHT: + bug_type = "alloca-out-of-bounds"; + break; + } + + return bug_type; +} + +static const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info->access_addr)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} 
+EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c index 7a2a2f13f86f..fcaa1ca03175 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/init.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains some kasan initialization code. * @@ -10,11 +11,10 @@ * */ -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> -#include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/slab.h> @@ -31,60 +31,60 @@ * - Latter it reused it as zero shadow to cover large ranges of memory * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). */ -unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; +unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 -p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; static inline bool kasan_p4d_table(pgd_t pgd) { - return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d)); } #else static inline bool kasan_p4d_table(pgd_t pgd) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 3 -pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss; static inline bool kasan_pud_table(p4d_t p4d) { - return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); } #else static inline bool kasan_pud_table(p4d_t p4d) { - return 0; + return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 -pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +pmd_t 
kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss; static inline bool kasan_pmd_table(pud_t pud) { - return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); } #else static inline bool kasan_pmd_table(pud_t pud) { - return 0; + return false; } #endif -pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) { - return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); } -static inline bool kasan_zero_page_entry(pte_t pte) +static inline bool kasan_early_shadow_page_entry(pte_t pte) { - return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); + return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page)); } static __init void *early_alloc(size_t size, int node) { - return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); } static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, @@ -93,7 +93,8 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, pte_t *pte = pte_offset_kernel(pmd, addr); pte_t zero_pte; - zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)), + PAGE_KERNEL); zero_pte = pte_wrprotect(zero_pte); while (addr + PAGE_SIZE <= end) { @@ -113,7 +114,8 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, next = pmd_addr_end(addr, end); if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); continue; } @@ -121,7 +123,7 @@ static int __ref 
zero_pmd_populate(pud_t *pud, unsigned long addr, pte_t *p; if (slab_is_available()) - p = pte_alloc_one_kernel(&init_mm, addr); + p = pte_alloc_one_kernel(&init_mm); else p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); if (!p) @@ -146,9 +148,11 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { pmd_t *pmd; - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); continue; } @@ -182,12 +186,14 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, pud_t *pud; pmd_t *pmd; - p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); pmd = pmd_offset(pud, addr); pmd_populate_kernel(&init_mm, pmd, - lm_alias(kasan_zero_pte)); + lm_alias(kasan_early_shadow_pte)); continue; } @@ -210,13 +216,13 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, } /** - * kasan_populate_zero_shadow - populate shadow memory region with - * kasan_zero_page + * kasan_populate_early_shadow - populate shadow memory region with + * kasan_early_shadow_page * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -int __ref kasan_populate_zero_shadow(const void *shadow_start, - const void *shadow_end) +int __ref kasan_populate_early_shadow(const void *shadow_start, + const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; unsigned long end = (unsigned long)shadow_end; @@ -232,7 +238,7 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start, pmd_t *pmd; /* - * 
kasan_zero_pud should be populated with pmds + * kasan_early_shadow_pud should be populated with pmds * at this moment. * [pud,pmd]_populate*() below needed only for * 3,2 - level page tables where we don't have @@ -242,21 +248,25 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start, * The ifndef is required to avoid build breakage. * * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_zero_p4d. It's not defined + * we reference kasan_early_shadow_p4d. It's not defined * unless 5-level paging enabled. * * The ifndef can be dropped once all KASAN-enabled * architectures will switch to pgtable-nop4d.h. */ #ifndef __ARCH_HAS_5LEVEL_HACK - pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); + pgd_populate(&init_mm, pgd, + lm_alias(kasan_early_shadow_p4d)); #endif p4d = p4d_offset(pgd, addr); - p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); - pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); continue; } @@ -351,7 +361,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, if (!pte_present(*pte)) continue; - if (WARN_ON(!kasan_zero_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) continue; pte_clear(&init_mm, addr, pte); } @@ -481,7 +491,7 @@ int kasan_add_zero_shadow(void *start, unsigned long size) WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) return -EINVAL; - ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + ret = kasan_populate_early_shadow(shadow_start, shadow_end); if (ret) kasan_remove_zero_shadow(shadow_start, size >> KASAN_SHADOW_SCALE_SHIFT); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c12dcfde2ebd..3e0c11f7d7a1 
100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -8,10 +8,22 @@ #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) +#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ +#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ +#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ + +#ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#else +#define KASAN_FREE_PAGE KASAN_TAG_INVALID +#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID +#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID +#endif + #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ /* @@ -22,7 +34,6 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 -#define KASAN_USE_AFTER_SCOPE 0xF8 /* * alloca redzone shadow values @@ -105,11 +116,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } +static inline bool addr_has_shadow(const void *addr) +{ + return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +void kasan_poison_shadow(const void *address, size_t size, u8 value); + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip); + +void *find_first_bad_addr(void *addr, size_t size); +const char *get_bug_type(struct kasan_access_info *info); + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip); -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) +#if defined(CONFIG_KASAN_GENERIC) && \ + (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct 
kasan_free_meta *info, struct kmem_cache *cache); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); @@ -120,6 +145,37 @@ static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif +#ifdef CONFIG_KASAN_SW_TAGS + +void print_tags(u8 addr_tag, const void *addr); + +u8 random_tag(void); + +#else + +static inline void print_tags(u8 addr_tag, const void *addr) { } + +static inline u8 random_tag(void) +{ + return 0; +} + +#endif + +#ifndef arch_kasan_set_tag +#define arch_kasan_set_tag(addr, tag) ((void *)(addr)) +#endif +#ifndef arch_kasan_reset_tag +#define arch_kasan_reset_tag(addr) ((void *)(addr)) +#endif +#ifndef arch_kasan_get_tag +#define arch_kasan_get_tag(addr) 0 +#endif + +#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) +#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) +#define get_tag(addr) arch_kasan_get_tag(addr) + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. @@ -130,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size); void __asan_loadN(unsigned long addr, size_t size); void __asan_storeN(unsigned long addr, size_t size); void __asan_handle_no_return(void); -void __asan_poison_stack_memory(const void *addr, size_t size); -void __asan_unpoison_stack_memory(const void *addr, size_t size); void __asan_alloca_poison(unsigned long addr, size_t size); void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..978bc4a3eb51 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * KASAN quarantine. 
* @@ -103,7 +104,7 @@ static int quarantine_head; static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; -static DEFINE_SPINLOCK(quarantine_lock); +static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ @@ -190,7 +191,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - spin_lock(&quarantine_lock); + raw_spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -203,7 +204,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock(&quarantine_lock); + raw_spin_unlock(&quarantine_lock); } local_irq_restore(flags); @@ -230,13 +231,13 @@ void quarantine_reduce(void) * expected case). */ srcu_idx = srcu_read_lock(&remove_cache_srcu); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (totalram_pages() << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); new_quarantine_size = (total_size < percpu_quarantines) ? 
@@ -254,7 +255,7 @@ void quarantine_reduce(void) quarantine_head = 0; } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); srcu_read_unlock(&remove_cache_srcu, srcu_idx); @@ -310,17 +311,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); /* Scanning whole quarantine can take a while. */ - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); cond_resched(); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5c169aa688fd..ca9418fe9232 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * This file contains error reporting code. + * This file contains common generic and tag-based KASAN error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
* Author: Andrey Ryabinin <[email protected]> @@ -39,129 +40,43 @@ #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) #define SHADOW_ROWS_AROUND_ADDR 2 -static const void *find_first_bad_addr(const void *addr, size_t size) -{ - u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); - const void *first_bad_addr = addr; - - while (!shadow_val && first_bad_addr < addr + size) { - first_bad_addr += KASAN_SHADOW_SCALE_SIZE; - shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); - } - return first_bad_addr; -} +static unsigned long kasan_flags; -static bool addr_has_shadow(struct kasan_access_info *info) -{ - return (info->access_addr >= - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); -} +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 -static const char *get_shadow_bug_type(struct kasan_access_info *info) +bool kasan_save_enable_multi_shot(void) { - const char *bug_type = "unknown-crash"; - u8 *shadow_addr; - - info->first_bad_addr = find_first_bad_addr(info->access_addr, - info->access_size); - - shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - - /* - * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look - * at the next shadow byte to determine the type of the bad access. - */ - if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) - shadow_addr++; - - switch (*shadow_addr) { - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - /* - * In theory it's still possible to see these shadow values - * due to a data race in the kernel code. 
- */ - bug_type = "out-of-bounds"; - break; - case KASAN_PAGE_REDZONE: - case KASAN_KMALLOC_REDZONE: - bug_type = "slab-out-of-bounds"; - break; - case KASAN_GLOBAL_REDZONE: - bug_type = "global-out-of-bounds"; - break; - case KASAN_STACK_LEFT: - case KASAN_STACK_MID: - case KASAN_STACK_RIGHT: - case KASAN_STACK_PARTIAL: - bug_type = "stack-out-of-bounds"; - break; - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use-after-free"; - break; - case KASAN_USE_AFTER_SCOPE: - bug_type = "use-after-scope"; - break; - case KASAN_ALLOCA_LEFT: - case KASAN_ALLOCA_RIGHT: - bug_type = "alloca-out-of-bounds"; - break; - } - - return bug_type; + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); -static const char *get_wild_bug_type(struct kasan_access_info *info) +void kasan_restore_multi_shot(bool enabled) { - const char *bug_type = "unknown-crash"; - - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - - return bug_type; + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); -static const char *get_bug_type(struct kasan_access_info *info) +static int __init kasan_set_multi_shot(char *str) { - if (addr_has_shadow(info)) - return get_shadow_bug_type(info); - return get_wild_bug_type(info); + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; } +__setup("kasan_multi_shot", kasan_set_multi_shot); static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = get_bug_type(info); - pr_err("BUG: KASAN: %s in %pS\n", - bug_type, (void *)info->ip); + get_bug_type(info), (void *)info->ip); pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? 
"Write" : "Read", info->access_size, info->access_addr, current->comm, task_pid_nr(current)); } -static inline bool kernel_or_module_addr(const void *addr) -{ - if (addr >= (void *)_stext && addr < (void *)_end) - return true; - if (is_module_address((unsigned long)addr)) - return true; - return false; -} - -static inline bool init_task_stack_addr(const void *addr) -{ - return addr >= (void *)&init_thread_union.stack && - (addr <= (void *)&init_thread_union.stack + - sizeof(init_thread_union.stack)); -} - static DEFINE_SPINLOCK(report_lock); -static void kasan_start_report(unsigned long *flags) +static void start_report(unsigned long *flags) { /* * Make sure we don't end up in loop. @@ -171,7 +86,7 @@ static void kasan_start_report(unsigned long *flags) pr_err("==================================================================\n"); } -static void kasan_end_report(unsigned long *flags) +static void end_report(unsigned long *flags) { pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -249,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object, describe_object_addr(cache, object, addr); } +static inline bool kernel_or_module_addr(const void *addr) +{ + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + static void print_address_description(void *addr) { struct page *page = addr_to_page(addr); @@ -326,126 +257,69 @@ static void print_shadow_for_address(const void *addr) } } +static bool report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, 
&kasan_flags); +} + void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; - kasan_start_report(&flags); + start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); + print_tags(get_tag(object), reset_tag(object)); + object = reset_tag(object); pr_err("\n"); print_address_description(object); pr_err("\n"); print_shadow_for_address(object); - kasan_end_report(&flags); -} - -static void kasan_report_error(struct kasan_access_info *info) -{ - unsigned long flags; - - kasan_start_report(&flags); - - print_error_description(info); - pr_err("\n"); - - if (!addr_has_shadow(info)) { - dump_stack(); - } else { - print_address_description((void *)info->access_addr); - pr_err("\n"); - print_shadow_for_address(info->first_bad_addr); - } - - kasan_end_report(&flags); -} - -static unsigned long kasan_flags; - -#define KASAN_BIT_REPORTED 0 -#define KASAN_BIT_MULTI_SHOT 1 - -bool kasan_save_enable_multi_shot(void) -{ - return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); - -void kasan_restore_multi_shot(bool enabled) -{ - if (!enabled) - clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); -} -EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); - -static int __init kasan_set_multi_shot(char *str) -{ - set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); - return 1; -} -__setup("kasan_multi_shot", kasan_set_multi_shot); - -static inline bool kasan_report_enabled(void) -{ - if (current->kasan_depth) - return false; - if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - return true; - return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); + end_report(&flags); } void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { struct kasan_access_info info; + void *tagged_addr; + void *untagged_addr; + unsigned long flags; - if (likely(!kasan_report_enabled())) + if (likely(!report_enabled())) return; disable_trace_on_warning(); - info.access_addr = (void 
*)addr; - info.first_bad_addr = (void *)addr; + tagged_addr = (void *)addr; + untagged_addr = reset_tag(tagged_addr); + + info.access_addr = tagged_addr; + if (addr_has_shadow(untagged_addr)) + info.first_bad_addr = find_first_bad_addr(tagged_addr, size); + else + info.first_bad_addr = untagged_addr; info.access_size = size; info.is_write = is_write; info.ip = ip; - kasan_report_error(&info); -} + start_report(&flags); + print_error_description(&info); + if (addr_has_shadow(untagged_addr)) + print_tags(get_tag(tagged_addr), info.first_bad_addr); + pr_err("\n"); -#define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, false, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_load##size##_noabort) - -#define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, true, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_store##size##_noabort) - -DEFINE_ASAN_REPORT_LOAD(1); -DEFINE_ASAN_REPORT_LOAD(2); -DEFINE_ASAN_REPORT_LOAD(4); -DEFINE_ASAN_REPORT_LOAD(8); -DEFINE_ASAN_REPORT_LOAD(16); -DEFINE_ASAN_REPORT_STORE(1); -DEFINE_ASAN_REPORT_STORE(2); -DEFINE_ASAN_REPORT_STORE(4); -DEFINE_ASAN_REPORT_STORE(8); -DEFINE_ASAN_REPORT_STORE(16); - -void __asan_report_load_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__asan_report_load_n_noabort); + if (addr_has_shadow(untagged_addr)) { + print_address_description(untagged_addr); + pr_err("\n"); + print_shadow_for_address(info.first_bad_addr); + } else { + dump_stack(); + } -void __asan_report_store_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, true, _RET_IP_); + end_report(&flags); } -EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c new file mode 100644 index 000000000000..63fca3172659 --- /dev/null +++ b/mm/kasan/tags.c @@ -0,0 +1,161 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core tag-based KASAN code. + * + * Copyright (c) 2018 Google, Inc. + * Author: Andrey Konovalov <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/linkage.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/bug.h> + +#include "kasan.h" +#include "../slab.h" + +static DEFINE_PER_CPU(u32, prng_state); + +void kasan_init_tags(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(prng_state, cpu) = (u32)get_cycles(); +} + +/* + * If a preemption happens between this_cpu_read and this_cpu_write, the only + * side effect is that we'll give a few allocated in different contexts objects + * the same tag. Since tag-based KASAN is meant to be used a probabilistic + * bug-detection debug feature, this doesn't have significant negative impact. + * + * Ideally the tags use strong randomness to prevent any attempts to predict + * them during explicit exploit attempts. But strong randomness is expensive, + * and we did an intentional trade-off to use a PRNG. This non-atomic RMW + * sequence has in fact positive effect, since interrupts that randomly skew + * PRNG at unpredictable points do only good. 
+ */ +u8 random_tag(void) +{ + u32 state = this_cpu_read(prng_state); + + state = 1664525 * state + 1013904223; + this_cpu_write(prng_state, state); + + return (u8)(state % (KASAN_TAG_MAX + 1)); +} + +void *kasan_reset_tag(const void *addr) +{ + return reset_tag(addr); +} + +void check_memory_region(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + u8 tag; + u8 *shadow_first, *shadow_last, *shadow; + void *untagged_addr; + + if (unlikely(size == 0)) + return; + + tag = get_tag((const void *)addr); + + /* + * Ignore accesses for pointers tagged with 0xff (native kernel + * pointer tag) to suppress false positives caused by kmap. + * + * Some kernel code was written to account for archs that don't keep + * high memory mapped all the time, but rather map and unmap particular + * pages when needed. Instead of storing a pointer to the kernel memory, + * this code saves the address of the page structure and offset within + * that page for later use. Those pages are then mapped and unmapped + * with kmap/kunmap when necessary and virt_to_page is used to get the + * virtual address of the page. For arm64 (that keeps the high memory + * mapped all the time), kmap is turned into a page_address call. + + * The issue is that with use of the page_address + virt_to_page + * sequence the top byte value of the original pointer gets lost (gets + * set to KASAN_TAG_KERNEL (0xFF)). 
+ */ + if (tag == KASAN_TAG_KERNEL) + return; + + untagged_addr = reset_tag((const void *)addr); + if (unlikely(untagged_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + kasan_report(addr, size, write, ret_ip); + return; + } + shadow_first = kasan_mem_to_shadow(untagged_addr); + shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); + for (shadow = shadow_first; shadow <= shadow_last; shadow++) { + if (*shadow != tag) { + kasan_report(addr, size, write, ret_ip); + return; + } + } +} + +#define DEFINE_HWASAN_LOAD_STORE(size) \ + void __hwasan_load##size##_noabort(unsigned long addr) \ + { \ + check_memory_region(addr, size, false, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ + void __hwasan_store##size##_noabort(unsigned long addr) \ + { \ + check_memory_region(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_store##size##_noabort) + +DEFINE_HWASAN_LOAD_STORE(1); +DEFINE_HWASAN_LOAD_STORE(2); +DEFINE_HWASAN_LOAD_STORE(4); +DEFINE_HWASAN_LOAD_STORE(8); +DEFINE_HWASAN_LOAD_STORE(16); + +void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +{ + check_memory_region(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_loadN_noabort); + +void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +{ + check_memory_region(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_storeN_noabort); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +{ + kasan_poison_shadow((void *)addr, size, tag); +} +EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c new file mode 100644 index 000000000000..8eaf5f722271 --- /dev/null +++ b/mm/kasan/tags_report.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains tag-based KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin <[email protected]> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> +#include <linux/module.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +const char *get_bug_type(struct kasan_access_info *info) +{ + return "invalid-access"; +} + +void *find_first_bad_addr(void *addr, size_t size) +{ + u8 tag = get_tag(addr); + void *p = reset_tag(addr); + void *end = p + size; + + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) + p += KASAN_SHADOW_SCALE_SIZE; + return p; +} + +void print_tags(u8 addr_tag, const void *addr) +{ + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); +} diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a31d740e6cd1..449044378782 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm, int isolated = 0, result = 0; struct mem_cgroup *memcg; struct vm_area_struct *vma; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmun_start = address; - mmun_end = address + 
HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); @@ -1076,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1225,7 +1224,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -1287,31 +1286,30 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * collapse_shmem - collapse small tmpfs/shmem pages into huge one. 
* * Basic scheme is simple, details are more complex: - * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one + * - allocate and lock a new huge page; + * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; - * + keep old pages around in case if rollback is required; - * - if replacing succeed: + * + keep old pages around in case rollback is required; + * - if replacing succeeds: * + copy data over; * + free old pages; - * + unfreeze huge page; + * + unlock huge page; * - if replacing failed; * + put all pages back and unfreeze them; - * + restore gaps in the radix-tree; - * + free huge page; + * + restore gaps in the page cache; + * + unlock and free huge page; */ static void collapse_shmem(struct mm_struct *mm, struct address_space *mapping, pgoff_t start, struct page **hpage, int node) { gfp_t gfp; - struct page *page, *new_page, *tmp; + struct page *new_page; struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - struct radix_tree_iter iter; - void **slot; + XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1330,59 +1328,71 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } + /* This will be less messy when we use multi-index entries */ + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) + break; + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) { + mem_cgroup_cancel_charge(new_page, memcg, true); + result = SCAN_FAIL; + goto out; + } + } while (1); + + __SetPageLocked(new_page); + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; - __SetPageSwapBacked(new_page); - __SetPageLocked(new_page); - BUG_ON(!page_ref_freeze(new_page, 1)); - /* - * At this point the new_page is 'frozen' (page_count() is zero), locked - * and not up-to-date. 
It's safe to insert it into radix tree, because - * nobody would be able to map it or use it in other way until we - * unfreeze it. + * At this point the new_page is locked and not up-to-date. + * It's safe to insert it into the page cache, because nobody would + * be able to map it or use it in another way until we unlock it. */ - index = start; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - int n = min(iter.index, end) - index; + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); - /* - * Handle holes in the radix tree: charge it from shmem and - * insert relevant subpage of new_page into the radix-tree. - */ - if (n && !shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - break; - } - nr_none += n; - for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); + VM_BUG_ON(index != xas.xa_index); + if (!page) { + /* + * Stop if extent has been truncated or hole-punched, + * and is now completely empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; + goto xa_locked; + } + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + nr_none++; + continue; } - /* We are done. 
*/ - if (index >= end) - break; - - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - xa_unlock_irq(&mapping->i_pages); + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; - goto tree_unlocked; + goto xa_unlocked; } - xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); + xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; - break; + goto xa_locked; } /* @@ -1391,38 +1401,46 @@ static void collapse_shmem(struct mm_struct *mm, */ VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - VM_BUG_ON_PAGE(PageTransCompound(page), page); + + /* + * If file was truncated then extended, or hole-punched, before + * we locked the first page, then a THP might be there already. + */ + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unlock; + } if (page_mapping(page) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; - goto out_isolate_failed; + goto out_unlock; } if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); - slot = radix_tree_lookup_slot(&mapping->i_pages, index); - VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock), page); + VM_BUG_ON_PAGE(page != xas_load(&xas), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: * - we hold a pin on it; - * - one reference from radix tree; + * - one reference from page cache; * - one from isolate_lru_page; */ if (!page_ref_freeze(page, 3)) { result = SCAN_PAGE_COUNT; - goto out_lru; + xas_unlock_irq(&xas); + 
putback_lru_page(page); + goto out_unlock; } /* @@ -1432,133 +1450,110 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->i_pages, slot, - new_page + (index % HPAGE_PMD_NR)); - - slot = radix_tree_iter_resume(slot, &iter); - index++; + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; -out_lru: - xa_unlock_irq(&mapping->i_pages); - putback_lru_page(page); -out_isolate_failed: - unlock_page(page); - put_page(page); - goto tree_unlocked; out_unlock: unlock_page(page); put_page(page); - break; + goto xa_unlocked; } - /* - * Handle hole in radix tree at the end of the range. - * This code only triggers if there's nothing in radix tree - * beyond 'end'. - */ - if (result == SCAN_SUCCEED && index < end) { - int n = end - index; - - if (!shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - goto tree_locked; - } + __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + struct zone *zone = page_zone(new_page); - for (; index < end; index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } - nr_none += n; + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } -tree_locked: - xa_unlock_irq(&mapping->i_pages); -tree_unlocked: +xa_locked: + xas_unlock_irq(&xas); +xa_unlocked: if (result == SCAN_SUCCEED) { - unsigned long flags; - struct zone *zone = page_zone(new_page); + struct page *page, *tmp; /* - * Replacing old pages with new one has succeed, now we need to - * copy the content and free old pages. + * Replacing old pages with new one has succeeded, now we + * need to copy the content and free the old pages. 
*/ + index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { + while (index < page->index) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); list_del(&page->lru); - unlock_page(page); - page_ref_unfreeze(page, 1); page->mapping = NULL; + page_ref_unfreeze(page, 1); ClearPageActive(page); ClearPageUnevictable(page); + unlock_page(page); put_page(page); + index++; } - - local_irq_save(flags); - __inc_node_page_state(new_page, NR_SHMEM_THPS); - if (nr_none) { - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + while (index < end) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; } - local_irq_restore(flags); - /* - * Remove pte page tables, so we can re-faulti - * the page as huge. - */ - retract_page_tables(mapping, start); - - /* Everything is ready, let's unfreeze the new_page */ - set_page_dirty(new_page); SetPageUptodate(new_page); - page_ref_unfreeze(new_page, HPAGE_PMD_NR); + page_ref_add(new_page, HPAGE_PMD_NR - 1); + set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_anon(new_page); - unlock_page(new_page); + /* + * Remove pte page tables, so we can re-fault the page as huge. 
+ */ + retract_page_tables(mapping, start); *hpage = NULL; khugepaged_pages_collapsed++; } else { - /* Something went wrong: rollback changes to the radix-tree */ + struct page *page; + + /* Something went wrong: roll back page cache changes */ + xas_lock_irq(&xas); + mapping->nrpages -= nr_none; shmem_uncharge(mapping->host, nr_none); - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; + + xas_set(&xas, start); + xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, struct page, lru); - if (!page || iter.index < page->index) { + if (!page || xas.xa_index < page->index) { if (!nr_none) break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->i_pages, iter.index); + xas_store(&xas, NULL); continue; } - VM_BUG_ON_PAGE(page->index != iter.index, page); + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->i_pages, slot, page); - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); - putback_lru_page(page); + xas_store(&xas, page); + xas_pause(&xas); + xas_unlock_irq(&xas); unlock_page(page); - xa_lock_irq(&mapping->i_pages); + putback_lru_page(page); + xas_lock_irq(&xas); } VM_BUG_ON(nr_none); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); - /* Unfreeze new_page, caller would take care about freeing it */ - page_ref_unfreeze(new_page, 1); mem_cgroup_cancel_charge(new_page, memcg, true); - unlock_page(new_page); new_page->mapping = NULL; } + + unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); /* TODO: tracepoints */ @@ -1569,8 +1564,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, pgoff_t start, struct page **hpage) { struct page *page = NULL; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = 
NUMA_NO_NODE; int result = SCAN_SUCCEED; @@ -1579,17 +1573,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= start + HPAGE_PMD_NR) - break; - - page = radix_tree_deref_slot(slot); - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, page)) continue; - } - if (radix_tree_exception(page)) { + if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; break; @@ -1628,7 +1616,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -1665,7 +1653,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17dd883198ae..707fa5579f66 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -86,12 +86,13 @@ #include <linux/seq_file.h> #include <linux/cpumask.h> #include <linux/spinlock.h> +#include <linux/module.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/stacktrace.h> #include <linux/cache.h> #include <linux/percpu.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mmzone.h> #include <linux/slab.h> @@ -181,6 +182,7 @@ struct kmemleak_object { /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +#define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 /* number of bytes to print at a time (1, 2, 4, 8) */ @@ -235,6 +237,9 @@ 
static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; +static bool kmemleak_verbose; +module_param_named(verbose, kmemleak_verbose, bool, 0600); + /* * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. However, both the kernel allocator and kmemleak may @@ -299,6 +304,25 @@ static void kmemleak_disable(void); kmemleak_disable(); \ } while (0) +#define warn_or_seq_printf(seq, fmt, ...) do { \ + if (seq) \ + seq_printf(seq, fmt, ##__VA_ARGS__); \ + else \ + pr_warn(fmt, ##__VA_ARGS__); \ +} while (0) + +static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type, + int rowsize, int groupsize, const void *buf, + size_t len, bool ascii) +{ + if (seq) + seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize, + buf, len, ascii); + else + print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type, + rowsize, groupsize, buf, len, ascii); +} + /* * Printing of the objects hex dump to the seq file. The number of lines to be * printed is limited to HEX_MAX_LINES to prevent seq file spamming. 
The @@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq, /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - seq_printf(seq, " hex dump (first %zu bytes):\n", len); + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); - seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, ptr, len, HEX_ASCII); kasan_enable_current(); } @@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq, int i; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); - seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", object->comm, object->pid, object->jiffies, msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); - seq_printf(seq, " backtrace:\n"); + warn_or_seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -550,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, unsigned long flags; struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -595,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, write_lock_irqsave(&kmemleak_lock, flags); - min_addr = min(min_addr, ptr); - max_addr = max(max_addr, ptr + size); + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + min_addr = 
min(min_addr, untagged_ptr); + max_addr = max(max_addr, untagged_ptr + size); link = &object_tree_root.rb_node; rb_parent = NULL; while (*link) { @@ -1309,6 +1335,7 @@ static void scan_block(void *_start, void *_end, unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); unsigned long flags; + unsigned long untagged_ptr; read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { @@ -1323,7 +1350,8 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - if (pointer < min_addr || pointer >= max_addr) + untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); + if (untagged_ptr < min_addr || untagged_ptr >= max_addr) continue; /* @@ -1523,11 +1551,14 @@ static void kmemleak_scan(void) unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); + + if (!page) + continue; - if (!pfn_valid(pfn)) + /* only scan pages belonging to this node */ + if (page_to_nid(page) != i) continue; - page = pfn_to_page(pfn); /* only scan if page is in use */ if (page_count(page) == 0) continue; @@ -1598,6 +1629,10 @@ static void kmemleak_scan(void) if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; + + if (kmemleak_verbose) + print_unreferenced(NULL, object); + new_leaks++; } spin_unlock_irqrestore(&object->lock, flags); @@ -1619,7 +1654,7 @@ static void kmemleak_scan(void) */ static int kmemleak_scan_thread(void *arg) { - static int first_run = 1; + static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN); pr_info("Automatic memory scanning thread started\n"); set_user_nice(current, 10); @@ -2113,9 +2148,11 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - mutex_lock(&scan_mutex); - start_scan_thread(); - mutex_unlock(&scan_mutex); + if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) { + mutex_lock(&scan_mutex); + 
start_scan_thread(); + mutex_unlock(&scan_mutex); + } pr_info("Kernel memory leak detector initialized\n"); @@ -25,7 +25,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> -#include <linux/jhash.h> +#include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> @@ -296,6 +296,7 @@ static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); @@ -597,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) - chain->nid = -1; /* debug */ + chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; @@ -666,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) free_stable_node(stable_node); } +enum get_ksm_page_flags { + GET_KSM_PAGE_NOLOCK, + GET_KSM_PAGE_LOCK, + GET_KSM_PAGE_TRYLOCK +}; + /* * get_ksm_page: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. @@ -685,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +static struct page *get_ksm_page(struct stable_node *stable_node, + enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; @@ -705,8 +713,9 @@ again: * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). 
* The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* @@ -727,8 +736,15 @@ again: goto stale; } - if (lock_it) { + if (flags == GET_KSM_PAGE_TRYLOCK) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); + + if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); @@ -762,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; @@ -862,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node) struct page *page; int err; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) { /* * get_ksm_page did remove_node_from_stable_tree itself. 
@@ -1009,7 +1025,7 @@ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_atomic(page); - checksum = jhash2(addr, PAGE_SIZE / 4, 17); + checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_atomic(addr); return checksum; } @@ -1042,8 +1058,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, }; int swapped; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1051,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmun_start = pvmw.address; - mmun_end = pvmw.address + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, pvmw.address, + pvmw.address + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; @@ -1105,7 +1120,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } @@ -1129,8 +1144,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -1140,9 +1154,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); ptep 
= pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { @@ -1188,7 +1201,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } @@ -1387,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, * stable_node parameter itself will be freed from * under us if it returns NULL. */ - _tree_page = get_ksm_page(dup, false); + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); if (!_tree_page) continue; nr += 1; @@ -1510,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, false); + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1615,7 +1628,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -1675,7 +1689,12 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, true); + tree_page = get_ksm_page(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + if (unlikely(!tree_page)) /* * The tree may have been rebalanced, @@ -1844,7 +1863,8 @@ again: * wrprotected at all times. Any will work * fine to continue the walk. 
*/ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -2070,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) remove_rmap_item_from_tree(rmap_item); if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) + return; + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* @@ -2244,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, false); + page = get_ksm_page(stable_node, + GET_KSM_PAGE_NOLOCK); if (page) put_page(page); cond_resched(); @@ -2391,6 +2415,8 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + unsigned int sleep_ms; + set_freezable(); set_user_nice(current, 5); @@ -2404,8 +2430,10 @@ static int ksm_scan_thread(void *nothing) try_to_freeze(); if (ksmd_should_run()) { - schedule_timeout_interruptible( - msecs_to_jiffies(ksm_thread_sleep_millisecs)); + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); @@ -2640,6 +2668,31 @@ again: goto again; } +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || + WARN_ON(!page_mapped(page)) || + WARN_ON(!PageLocked(page))) { + dump_page(page, "reuse_ksm_page"); + return false; + } +#endif + + if (PageSwapCache(page) || !page_stable_node(page)) + return false; + /* Prohibit parallel get_ksm_page() */ + if (!page_ref_freeze(page, 1)) + return false; + + page_move_anon_rmap(page, vma); + page->index = linear_page_index(vma, address); + 
page_ref_unfreeze(page, 1); + + return true; +} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { @@ -2824,6 +2877,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, return -EINVAL; ksm_thread_sleep_millisecs = msecs; + wake_up_interruptible(&ksm_iter_wait); return count; } diff --git a/mm/list_lru.c b/mm/list_lru.c index 5b30625fd365..0730bf8ff39f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; - size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; #ifdef CONFIG_MEMCG_KMEM @@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #endif memcg_get_cache_ids(); - lru->node = kzalloc(size, GFP_KERNEL); + lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) goto out; diff --git a/mm/maccess.c b/mm/maccess.c index f3416632e5a4..ec00be51a24f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -30,10 +30,8 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); - current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -60,9 +58,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -98,13 +94,11 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; do { ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); - 
current->kernel_uaccess_faults_ok--; dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); diff --git a/mm/madvise.c b/mm/madvise.c index 71d21df2a3f3..21a7881a2db4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; @@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb, static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { - unsigned long start, end; struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) + range.start = max(vma->vm_start, start_addr); + if (range.start >= vma->vm_end) return -EINVAL; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) + range.end = min(vma->vm_end, end_addr); + if (range.end <= vma->vm_start) return -EINVAL; + mmu_notifier_range_init(&range, mm, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, range.start, range.end); update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - madvise_free_page_range(&tlb, vma, start, end); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_invalidate_range_start(&range); + madvise_free_page_range(&tlb, vma, range.start, range.end); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, range.start, range.end); return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 237944479d25..470601115892 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,13 +20,19 @@ 
#include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <asm/sections.h> #include <linux/io.h> #include "internal.h" +#define INIT_MEMBLOCK_REGIONS 128 +#define INIT_PHYSMEM_REGIONS 4 + +#ifndef INIT_MEMBLOCK_RESERVED_REGIONS +# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS +#endif + /** * DOC: memblock overview * @@ -82,8 +88,18 @@ * initialization compltes. */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; +unsigned long long max_possible_pfn; + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; -static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; #endif @@ -96,7 +112,7 @@ struct memblock memblock __initdata_memblock = { .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ - .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -253,7 +269,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t kernel_end, ret; /* pump up @end */ - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + if (end == MEMBLOCK_ALLOC_ACCESSIBLE || + end == MEMBLOCK_ALLOC_KASAN) end = memblock.current_limit; /* avoid allocating the first page */ @@ -791,7 +808,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.memory, base, size); } - +/** + * memblock_free - free boot 
memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -1170,7 +1194,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * Common iterator interface used to define for_each_mem_range(). + * Common iterator interface used to define for_each_mem_pfn_range(). */ void __init_memblock __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -1238,8 +1262,11 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, { phys_addr_t found; - if (!align) + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); align = SMP_CACHE_BYTES; + } found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1269,7 +1296,7 @@ phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } -phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; @@ -1304,23 +1331,22 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys return alloc; } -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align) { return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - phys_addr_t res 
= memblock_alloc_nid(size, align, nid); + phys_addr_t res = memblock_phys_alloc_nid(size, align, nid); if (res) return res; return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -#if defined(CONFIG_NO_BOOTMEM) /** - * memblock_virt_alloc_internal - allocate boot memory block + * memblock_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region to allocate (phys address) @@ -1333,9 +1359,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * hold the requested memory. * * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. - * - * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. @@ -1346,7 +1370,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ -static void * __init memblock_virt_alloc_internal( +static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1361,13 +1385,15 @@ static void * __init memblock_virt_alloc_internal( /* * Detect any accidental use of these APIs after slab is ready, as at * this moment memblock may be deinitialized already and its - * internal data may be destroyed (after execution of free_all_bootmem) + * internal data may be destroyed (after execution of memblock_free_all) */ if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) + if (!align) { + dump_stack(); align = SMP_CACHE_BYTES; + } if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; @@ -1401,26 +1427,28 @@ again: done: ptr = phys_to_virt(alloc); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. This is because many of these blocks - * are only referred via the physical address which is not - * looked up by kmemleak. - */ - kmemleak_alloc(ptr, size, 0, 0); + /* Skip kmemleak for kasan_init() due to high volume. */ + if (max_addr != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that bootmem allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. 
+ */ + kmemleak_alloc(ptr, size, 0, 0); return ptr; } /** - * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1431,7 +1459,7 @@ done: * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_raw( +void * __init memblock_alloc_try_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1442,23 +1470,22 @@ void * __init memblock_virt_alloc_try_nid_raw( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); -#ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, PAGE_POISON_PATTERN, size); -#endif + page_init_poison(ptr, size); + return ptr; } /** - * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + 
* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1468,7 +1495,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_nopanic( +void * __init memblock_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1479,7 +1506,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) memset(ptr, 0, size); @@ -1487,24 +1514,24 @@ void * __init memblock_virt_alloc_try_nid_nopanic( } /** - * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * memblock_alloc_try_nid - allocate boot memory block with panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ -void * __init memblock_virt_alloc_try_nid( +void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1514,7 +1541,7 @@ void * __init memblock_virt_alloc_try_nid( memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { memset(ptr, 0, size); @@ -1525,25 +1552,6 @@ void * __init memblock_virt_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } -#endif - -/** - * __memblock_free_early - free boot memory block - * @base: phys starting address of the boot memory block - * @size: size of the boot memory block in bytes - * - * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. - * The freeing memory will not be released to the buddy allocator. 
- */ -void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) -{ - phys_addr_t end = base + size - 1; - - memblock_dbg("%s: [%pa-%pa] %pF\n", - __func__, &base, &end, (void *)_RET_IP_); - kmemleak_free_part_phys(base, size); - memblock_remove_range(&memblock.reserved, base, size); -} /** * __memblock_free_late - free bootmem block pages directly to buddy allocator @@ -1566,8 +1574,8 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + memblock_free_pages(pfn_to_page(cursor), cursor, 0); + totalram_pages_inc(); } } @@ -1718,7 +1726,7 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr return -1; } -bool __init memblock_is_reserved(phys_addr_t addr) +bool __init_memblock memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; } @@ -1880,6 +1888,100 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int order; + + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); + + while (start + (1UL << order) > end) + order--; + + memblock_free_pages(pfn_to_page(start), start, order); + + start += (1UL << order); + } +} + +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn >= end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + +static unsigned long __init free_low_memory_core_early(void) +{ + unsigned long count = 0; + phys_addr_t start, end; + u64 i; + + memblock_clear_hotplug(0, -1); + + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + + /* 
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) + count += __free_memory_core(start, end); + + return count; +} + +static int reset_managed_pages_done __initdata; + +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + atomic_long_set(&z->managed_pages, 0); +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * memblock_free_all - release free pages to the buddy allocator + * + * Return: the number of pages actually released. + */ +unsigned long __init memblock_free_all(void) +{ + unsigned long pages; + + reset_all_zones_managed_pages(); + + pages = free_low_memory_core_early(); + totalram_pages_add(pages); + + return pages; +} + #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) @@ -1903,8 +2005,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { struct dentry *root = debugfs_create_dir("memblock", NULL); - if (!root) - return -ENXIO; + debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..532e0e2a4817 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> +#include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> #include <linux/backing-dev.h> @@ -248,6 +249,12 @@ enum res_type { iter != NULL; \ iter = 
mem_cgroup_iter(NULL, iter, NULL)) +static inline bool should_force_charge(void) +{ + return tsk_is_oom_victim(current) || fatal_signal_pending(current) || + (current->flags & PF_EXITING); +} + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -1293,32 +1300,39 @@ static const char *const memcg1_stat_names[] = { #define K(x) ((x) << (PAGE_SHIFT-10)) /** - * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. + * mem_cgroup_print_oom_context: Print OOM information relevant to + * memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is * enabled */ -void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { - struct mem_cgroup *iter; - unsigned int i; - rcu_read_lock(); + if (memcg) { + pr_cont(",oom_memcg="); + pr_cont_cgroup_path(memcg->css.cgroup); + } else + pr_cont(",global_oom"); if (p) { - pr_info("Task in "); + pr_cont(",task_memcg="); pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); - } else { - pr_info("Memory limit reached of cgroup "); } - - pr_cont_cgroup_path(memcg->css.cgroup); - pr_cont("\n"); - rcu_read_unlock(); +} + +/** + * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to + * memory controller. 
+ * @memcg: The memory cgroup that went over limit + */ +void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + unsigned int i; pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), @@ -1382,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, }; bool ret; - mutex_lock(&oom_lock); - ret = out_of_memory(&oc); + if (mutex_lock_killable(&oom_lock)) + return true; + /* + * A few threads which were not waiting at mutex_lock_killable() can + * fail to bail out. Therefore, check again after holding oom_lock. + */ + ret = should_force_charge() || out_of_memory(&oc); mutex_unlock(&oom_lock); return ret; } @@ -1666,9 +1685,14 @@ enum oom_status { static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { + enum oom_status ret; + bool locked; + if (order > PAGE_ALLOC_COSTLY_ORDER) return OOM_SKIPPED; + memcg_memory_event(memcg, MEMCG_OOM); + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack @@ -1698,10 +1722,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int return OOM_ASYNC; } + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + mem_cgroup_unmark_under_oom(memcg); if (mem_cgroup_out_of_memory(memcg, mask, order)) - return OOM_SUCCESS; + ret = OOM_SUCCESS; + else + ret = OOM_FAILED; + + if (locked) + mem_cgroup_oom_unlock(memcg); - return OOM_FAILED; + return ret; } /** @@ -2184,9 +2221,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. 
*/ - if (unlikely(tsk_is_oom_victim(current) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) + if (unlikely(should_force_charge())) goto force; /* @@ -2250,8 +2285,6 @@ retry: if (fatal_signal_pending(current)) goto force; - memcg_memory_event(mem_over_limit, MEMCG_OOM); - /* * keep retrying as long as the memcg oom killer is able to make * a forward progress or bypass the charge if the oom killer @@ -2329,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void lock_page_lru(struct page *page, int *isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2345,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated) static void unlock_page_lru(struct page *page, int isolated) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2460,7 +2493,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@ -2478,25 +2511,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. - */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@ -2531,9 +2545,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) @@ -2572,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge_memcg: charge a kmem page + * __memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order @@ -2580,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) * * Returns 0 on success, an error code on failure. 
*/ -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; @@ -2603,14 +2614,14 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, } /** - * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * __memcg_kmem_charge: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; @@ -2620,7 +2631,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) __SetPageKmemcg(page); } @@ -2628,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return ret; } /** - * memcg_kmem_uncharge: uncharge a kmem page + * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ -void memcg_kmem_uncharge(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2663,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone_lru_lock and migration entries setup in all page mappings. + * pgdat->lru_lock and migration entries setup in all page mappings. 
*/ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -3336,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -3387,7 +3398,7 @@ static const char *const memcg1_event_names[] = { static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -3625,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, size = thresholds->primary ? thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), - GFP_KERNEL); + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); if (!new) { ret = -ENOMEM; goto unlock; @@ -3820,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); @@ -4321,14 +4331,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + 
if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@ -4421,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - size_t size; + unsigned int size; int node; size = sizeof(struct mem_cgroup); @@ -4545,7 +4553,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -4573,6 +4581,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + drain_all_stock(memcg); + mem_cgroup_id_put(memcg); } @@ -4750,7 +4760,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. */ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; @@ -5353,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) root_mem_cgroup->use_hierarchy = false; } +static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -5363,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_min_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long min = READ_ONCE(memcg->memory.min); - - if (min == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, 
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); } static ssize_t memory_min_write(struct kernfs_open_file *of, @@ -5393,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of, static int memory_low_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->memory.low); - - if (low == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); } static ssize_t memory_low_write(struct kernfs_open_file *of, @@ -5423,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long high = READ_ONCE(memcg->high); - - if (high == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -5460,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); } static ssize_t memory_max_write(struct kernfs_open_file *of, @@ -5522,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, static int memory_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "low %lu\n", 
atomic_long_read(&memcg->memory_events[MEMCG_LOW])); @@ -5540,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); struct accumulated_stats acc; int i; @@ -5581,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file_writeback %llu\n", (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + /* + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter + * with the NR_ANON_THP vm counter, but right now it's a pain in the + * arse because it requires migrating the work out of rmap to a place + * where the page->mem_cgroup is set up and stable. + */ + seq_printf(m, "anon_thp %llu\n", + (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], (u64)acc.lru_pages[i] * PAGE_SIZE); @@ -5595,6 +5595,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@ -5605,19 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); - seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - 
acc.stat[WORKINGSET_NODERECLAIM]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_collapse_alloc %lu\n", + acc.events[THP_COLLAPSE_ALLOC]); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return 0; } static int memory_oom_group_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "%d\n", memcg->oom_group); @@ -5746,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = { * * | memory.current, if memory.current < memory.low * low_usage = | - | 0, otherwise. + * | 0, otherwise. * * * Such definition of the effective memory.low provides the expected @@ -6377,7 +6383,7 @@ subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. 
@@ -6600,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.max); - - if (max == PAGE_COUNTER_MAX) - seq_puts(m, "max\n"); - else - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - - return 0; + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); } static ssize_t swap_max_write(struct kernfs_open_file *of, @@ -6630,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, static int swap_events_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); diff --git a/mm/memfd.c b/mm/memfd.c index 2bb5e257080e..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -21,44 +21,36 @@ #include <uapi/linux/memfd.h> /* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. 
*/ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void memfd_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct xa_state *xas) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_lock_irq(xas); + xas_for_each(xas, page, ULONG_MAX) { + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) > 1) + xas_set_mark(xas, MEMFD_TAG_PINNED); + + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(xas); + xas_unlock_irq(xas); + cond_resched(); + xas_lock_irq(xas); } - rcu_read_unlock(); + xas_unlock_irq(xas); } /* @@ -72,17 +64,17 @@ static void memfd_tag_pins(struct address_space *mapping) */ static int memfd_wait_for_pins(struct address_space *mapping) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; + XA_STATE(xas, &mapping->i_pages, 0); struct page *page; int error, scan; - memfd_tag_pins(mapping); + memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + unsigned int tagged = 0; + + if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) @@ -90,45 +82,34 @@ static int memfd_wait_for_pins(struct address_space *mapping) else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; - start = 0; - 
rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - + xas_set(&xas, 0); + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + bool clear = true; + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. */ - error = -EBUSY; + if (scan == LAST_SCAN) + error = -EBUSY; + else + clear = false; } + if (clear) + xas_clear_mark(&xas, MEMFD_TAG_PINNED); + if (++tagged % XA_CHECK_SCHED) + continue; - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } - rcu_read_unlock(); + xas_unlock_irq(&xas); } return error; @@ -150,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0cd3de3550f0..fc8b51744579 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, if (fail || tk->addr_valid == 0) { pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", 
pfn, tk->tsk->comm, tk->tsk->pid); - force_sig(SIGKILL, tk->tsk); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, + tk->tsk, PIDTYPE_PID); } /* @@ -1161,6 +1162,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, LIST_HEAD(tokill); int rc = -EBUSY; loff_t start; + dax_entry_t cookie; /* * Prevent the inode from being freed while we are interrogating @@ -1169,7 +1171,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * also prevents changes to the mapping of this pfn until * poison signaling is complete. */ - if (!dax_lock_mapping_entry(page)) + cookie = dax_lock_page(page); + if (!cookie) goto out; if (hwpoison_filter(page)) { @@ -1220,7 +1223,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); rc = 0; unlock: - dax_unlock_mapping_entry(page); + dax_unlock_page(page, cookie); out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); @@ -1822,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags) struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unlock_page(page); + if (!PageAnon(page)) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(hpage); + put_hwpoison_page(page); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(page); - put_hwpoison_page(hpage); + unlock_page(page); } /* diff --git a/mm/memory.c b/mm/memory.c index 21a5e6e4758b..47fe250307c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,7 @@ #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/io.h> 
#include <asm/mmu_context.h> @@ -400,10 +401,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - pgtable_t new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -434,9 +435,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) return 0; } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); + pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; @@ -973,8 +974,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; bool is_cow; int ret; @@ -1008,11 +1008,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is_cow_mapping() returns true. 
*/ is_cow = is_cow_mapping(vma->vm_flags); - mmun_start = addr; - mmun_end = end; - if (is_cow) - mmu_notifier_invalidate_range_start(src_mm, mmun_start, - mmun_end); + + if (is_cow) { + mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1029,7 +1029,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) - mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -1332,12 +1332,13 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); - mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); + mmu_notifier_invalidate_range_end(&range); } /** @@ -1351,18 +1352,18 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); + 
update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + unmap_single_vma(&tlb, vma, start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, start, range.end); } /** @@ -1377,17 +1378,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, address, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, address, end); - unmap_single_vma(&tlb, vma, address, end, details); - mmu_notifier_invalidate_range_end(mm, address, end); - tlb_finish_mmu(&tlb, address, end); + mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + unmap_single_vma(&tlb, vma, address, range.end, details); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, address, range.end); } /** @@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; retval = -EINVAL; - if (PageAnon(page)) + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); @@ -1503,6 +1504,8 @@ out: * under mm->mmap_sem write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. + * + * Return: %0 on success, negative error code otherwise. 
*/ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -1520,19 +1523,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; - int retval; pte_t *pte, entry; spinlock_t *ptl; - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; - retval = -EBUSY; + return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* @@ -1540,10 +1540,15 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared - * mapping and we expect the PFNs to match. + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. */ - if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; + } entry = *pte; goto out_mkwrite; } else @@ -1565,56 +1570,32 @@ out_mkwrite: set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ - retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. 
- * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_pfn); /** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vm_insert_pfn, except that it allows drivers to + * This is exactly like vmf_insert_pfn(), except that it allows drivers to * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * COW mappings. In general, using multiple vmas is preferable; + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. */ -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). 
However we would like @@ -1628,19 +1609,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); - return ret; +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. 
+ */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } -EXPORT_SYMBOL(vm_insert_pfn_prot); +EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { @@ -1656,20 +1662,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) return false; } -static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, bool mkwrite) +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; + int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1688,36 +1695,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. 
*/ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot); + } else { + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } - return insert_pfn(vma, addr, pfn, pgprot, mkwrite); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); - } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ - vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { - int err; - - err = __vm_insert_mixed(vma, addr, pfn, true); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - return VM_FAULT_NOPAGE; + return __vm_insert_mixed(vma, addr, pfn, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); @@ -1827,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @size: size of map area * @prot: page protection flags for this mapping * - * Note: this is only safe if the mm semaphore is held when called. + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) @@ -1900,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range); * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. + * + * Return: %0 on success, negative error code otherwise. 
*/ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { @@ -2244,9 +2254,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = vmf->address & PAGE_MASK; - const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2269,7 +2278,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + (vmf->address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock @@ -2346,7 +2357,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_only_end(&range); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -2377,12 +2388,13 @@ oom: * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. - * It handles locking of PTE and modifying it. The function returns - * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE - * lock. + * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). + * + * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before + * we acquired PTE lock. 
*/ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { @@ -2500,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { + if (PageAnon(vmf->page)) { int total_map_swapcount; + if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || + page_count(vmf->page) != 1)) + goto copy; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2516,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } + if (PageKsm(vmf->page)) { + bool reused = reuse_ksm_page(vmf->page, vmf->vma, + vmf->address); + unlock_page(vmf->page); + if (!reused) + goto copy; + wp_page_reuse(vmf); + return VM_FAULT_WRITE; + } if (reuse_swap_page(vmf->page, &total_map_swapcount)) { if (total_map_swapcount == 1) { /* @@ -2536,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } - +copy: /* * Ok, we need to copy. Oh, well.. */ @@ -2892,7 +2916,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ @@ -2990,6 +3014,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + /* + * Preallocate pte before we take page_lock because this might lead to + * deadlocks for memcg reclaim which waits for pages under writeback: + * lock_page(A) + * SetPageWriteback(A) + * unlock_page(A) + * lock_page(B) + * lock_page(B) + * pte_alloc_pne + * shrink_page_list + * wait_on_page_writeback(A) + * SetPageWriteback(B) + * unlock_page(B) + * # flush A, B to clear the writeback + */ + if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) @@ -3039,7 +3085,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: @@ -3118,7 +3164,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3175,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Target users are page handler itself and implementations of * vm_ops->map_pages. 
+ * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) @@ -3235,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU - * addition. The function returns 0 on success, VM_FAULT_ code in case of - * error. + * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { @@ -3295,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, static int __init fault_around_debugfs(void) { - void *ret; - - ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, - &fault_around_bytes_fops); - if (!ret) - pr_warn("Failed to create fault_around_bytes in debugfs"); + debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); @@ -3356,8 +3401,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, - vmf->address); + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3492,16 +3536,45 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). 
*/ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; - /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ - if (!vma->vm_ops->fault) - ret = VM_FAULT_SIGBUS; - else if (!(vmf->flags & FAULT_FLAG_WRITE)) + /* + * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND + */ + if (!vma->vm_ops->fault) { + /* + * If we find a migration pmd entry or a none pmd entry, which + * should never happen, return SIGBUS + */ + if (unlikely(!pmd_present(*vmf->pmd))) + ret = VM_FAULT_SIGBUS; + else { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, + vmf->pmd, + vmf->address, + &vmf->ptl); + /* + * Make sure this is not a temporary clearing of pte + * by holding ptl and checking again. A R/M/W update + * of pte involves: take ptl, clearing the pte so that + * we don't have concurrent modification by hardware + * followed by an update. + */ + if (unlikely(pte_none(*vmf->pte))) + ret = VM_FAULT_SIGBUS; + else + ret = VM_FAULT_NOPAGE; + + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); @@ -3510,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; @@ -3535,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; - int page_nid = -1; + int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; bool migrated = false; - pte_t pte; + pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -3559,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes 
non * accessible ptes, some can allow access by kernel mode. */ - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); - pte = pte_modify(pte, vma->vm_page_prot); + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); @@ -3602,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { put_page(page); goto out; } @@ -3616,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; out: - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } @@ -3801,7 +3874,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; - if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3827,7 +3900,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4001,7 +4074,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct 
*mm, unsigned long address, - unsigned long *start, unsigned long *end, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; @@ -4029,10 +4102,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; - if (start && end) { - *start = address & PMD_MASK; - *end = *start + PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + mmu_notifier_range_init(range, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { @@ -4040,17 +4113,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; } spin_unlock(*ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; - if (start && end) { - *start = address & PAGE_MASK; - *end = *start + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + mmu_notifier_range_init(range, mm, address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) @@ -4059,8 +4132,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); out: return -EINVAL; } @@ -4072,20 +4145,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, NULL, NULL, + !(res = __follow_pte_pmd(mm, address, NULL, ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - 
unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + struct mmu_notifier_range *range, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, start, end, + !(res = __follow_pte_pmd(mm, address, range, ptepp, pmdpp, ptlp))); return res; } @@ -4099,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd); * * Only IO mappings and raw PFN mappings are allowed. * - * Returns zero and the pfn at @pfn on success, -ve otherwise. + * Return: zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) @@ -4249,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 38d94b703e9d..6b05576fb4ec 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,8 +33,8 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <linux/compaction.h> +#include <linux/rmap.h> #include <asm/tlbflush.h> @@ -47,7 +47,7 @@ * and restore_online_page_callback() for generic callback restore. 
*/ -static void generic_online_page(struct page *page); +static void generic_online_page(struct page *page, unsigned int order); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -254,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap); + ret = sparse_add_one_section(nid, phys_start_pfn, altmap); if (ret < 0) return ret; @@ -587,6 +587,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + cond_resched(); ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, altmap); map_offset = 0; @@ -655,26 +656,40 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static void generic_online_page(struct page *page, unsigned int order) +{ + kernel_map_pages(page, 1 << order, 1); + __free_pages_core(page, order); + totalram_pages_add(1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages_add(1UL << order); +#endif +} + +static int online_pages_blocks(unsigned long start, unsigned long nr_pages) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + unsigned long end = start + nr_pages; + int order, onlined_pages = 0; + + while (start < end) { + order = min(MAX_ORDER - 1, + get_order(PFN_PHYS(end) - PFN_PHYS(start))); + (*online_page_callback)(pfn_to_page(start), order); + + onlined_pages += (1UL << order); + start += (1UL << order); + } + return onlined_pages; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; if 
(PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } + onlined_pages += online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); @@ -687,62 +702,19 @@ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); - enum zone_type zone_last = ZONE_NORMAL; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; - /* - * if the memory to be online is in a zone of 0...zone_last, and - * the zones of 0...zone_last don't have memory before online, we will - * need to set the node to node_states[N_NORMAL_MEMORY] after - * the memory is online. - */ - if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; - else - arg->status_change_nid_normal = -1; - #ifdef CONFIG_HIGHMEM - /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. 
- */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif - - /* - * if the node don't have memory befor online, we will need to - * set the node to node_states[N_MEMORY] after the memory - * is online. - */ - if (!node_state(nid, N_MEMORY)) - arg->status_change_nid = nid; - else - arg->status_change_nid = -1; } static void node_states_set_node(int node, struct memory_notify *arg) @@ -753,7 +725,8 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); - node_set_state(node, N_MEMORY); + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, @@ -785,14 +758,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, int nid = pgdat->node_id; unsigned long flags; - if (zone_is_empty(zone)) - init_currently_empty_zone(zone, start_pfn, nr_pages); - clear_zone_contiguous(zone); /* TODO Huh pgdat is irqsave while zone is not. 
It used to be like that before */ pgdat_resize_lock(pgdat, &flags); zone_span_writelock(zone); + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); resize_zone_range(zone, start_pfn, nr_pages); zone_span_writeunlock(zone); resize_pgdat_range(pgdat, start_pfn, nr_pages); @@ -881,7 +853,6 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, return zone; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -893,6 +864,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; struct memory_block *mem; + mem_hotplug_begin(); + /* * We can't use pfn_to_nid() because nid might be stored in struct page * which is not yet initialized. Instead, we find nid from memory block. @@ -957,6 +930,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + mem_hotplug_done(); return 0; failed_addition: @@ -964,6 +938,7 @@ failed_addition: (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + mem_hotplug_done(); return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1111,8 +1086,13 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, bool online) +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). 
+ * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ +int __ref add_memory_resource(int nid, struct resource *res) { u64 start, size; bool new_node = false; @@ -1163,26 +1143,26 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* device_online() will take the lock when calling online_pages() */ + mem_hotplug_done(); + /* online pages if requested */ - if (online) + if (memhp_auto_online) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block); - goto out; - + return ret; error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); memblock_remove(start, size); - -out: mem_hotplug_done(); return ret; } -EXPORT_SYMBOL_GPL(add_memory_resource); -int __ref add_memory(int nid, u64 start, u64 size) +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; @@ -1191,11 +1171,22 @@ int __ref add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res, memhp_auto_online); + ret = add_memory_resource(nid, res); if (ret < 0) release_memory_resource(res); return ret; } + +int add_memory(int nid, u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size); + unlock_device_hotplug(); + + return rc; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -1211,11 +1202,13 @@ static inline int pageblock_free(struct page *page) return PageBuddy(page) && page_order(page) >= pageblock_order; } -/* Return the start of the next active pageblock after a given page */ -static struct page *next_active_pageblock(struct page *page) +/* Return the pfn of the start of the next active pageblock after a given pfn */ +static unsigned long next_active_pageblock(unsigned long pfn) { + struct 
page *page = pfn_to_page(pfn); + /* Ensure the starting page is pageblock-aligned */ - BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + BUG_ON(pfn & (pageblock_nr_pages - 1)); /* If the entire pageblock is free, move to the end of free page */ if (pageblock_free(page)) { @@ -1223,16 +1216,16 @@ static struct page *next_active_pageblock(struct page *page) /* be careful. we don't have locks, page_order can be changed.*/ order = page_order(page); if ((order < MAX_ORDER) && (order >= pageblock_order)) - return page + (1 << order); + return pfn + (1 << order); } - return page + pageblock_nr_pages; + return pfn + pageblock_nr_pages; } -static bool is_pageblock_removable_nolock(struct page *page) +static bool is_pageblock_removable_nolock(unsigned long pfn) { + struct page *page = pfn_to_page(pfn); struct zone *zone; - unsigned long pfn; /* * We have to be careful here because we are iterating over memory @@ -1249,18 +1242,20 @@ static bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON); } /* Checks if this range of memory is likely to be hot-removable. 
*/ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - struct page *page = pfn_to_page(start_pfn); - struct page *end_page = page + nr_pages; + unsigned long end_pfn, pfn; + + end_pfn = min(start_pfn + nr_pages, + zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); /* Check the starting page of each pageblock within the range */ - for (; page < end_page; page = next_active_pageblock(page)) { - if (!is_pageblock_removable_nolock(page)) + for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { + if (!is_pageblock_removable_nolock(pfn)) return false; cond_resched(); } @@ -1296,6 +1291,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; + /* Check if we got outside of the zone */ + if (zone && !zone_spans_pfn(zone, pfn + i)) + return 0; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; @@ -1324,23 +1322,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; - struct page *page; + for (pfn = start; pfn < end; pfn++) { - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageLRU(page)) - return pfn; - if (__PageMovable(page)) - return pfn; - if (PageHuge(page)) { - if (hugepage_migration_supported(page_hstate(page)) && - page_huge_active(page)) - return pfn; - else - pfn = round_up(pfn + 1, - 1 << compound_order(page)) - 1; - } - } + struct page *page, *head; + unsigned long skip; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + if (__PageMovable(page)) + return pfn; + + if (!PageHuge(page)) + continue; + head = compound_head(page); + if (hugepage_migration_supported(page_hstate(head)) && + page_huge_active(head)) + return pfn; + skip = (1 << compound_order(head)) - (page - head); + pfn += skip - 1; } return 0; } @@ -1362,36 
+1364,47 @@ static struct page *new_node_page(struct page *page, unsigned long private) return new_page_nodemask(page, nid, &nmask); } -#define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; - int move_pages = NR_OFFLINE_AT_ONCE_PAGES; - int not_managed = 0; int ret = 0; LIST_HEAD(source); - for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); if (PageHuge(page)) { struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; if (compound_order(head) > PFN_SECTION_SHIFT) { ret = -EBUSY; break; } - if (isolate_huge_page(page, &source)) - move_pages -= 1 << compound_order(head); + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; + isolate_huge_page(head, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; + /* + * HWPoison pages have elevated reference counts so the migration would + * fail on them. It also doesn't make any sense to migrate them in the + * first place. Still try to unmap such a page in case it is still mapped + * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep + * the unmap as the catch all safety net). 
+ */ + if (PageHWPoison(page)) { + if (WARN_ON(PageLRU(page))) + isolate_lru_page(page); + if (page_mapped(page)) + try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + continue; + } + if (!get_page_unless_zero(page)) continue; /* @@ -1403,41 +1416,31 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ - put_page(page); list_add_tail(&page->lru, &source); - move_pages--; if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } else { -#ifdef CONFIG_DEBUG_VM - pr_alert("failed to isolate pfn %lx\n", pfn); + pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); -#endif - put_page(page); - /* Because we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) { - not_managed++; - ret = -EBUSY; - break; - } } + put_page(page); } if (!list_empty(&source)) { - if (not_managed) { - putback_movable_pages(&source); - goto out; - } - /* Allocate a new page from the nearest neighbor node */ ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); - if (ret) + if (ret) { + list_for_each_entry(page, &source, lru) { + pr_warn("migrating pfn %lx failed ret:%d ", + page_to_pfn(page), ret); + dump_page(page, "migration failure"); + } putback_movable_pages(&source); + } } -out: + return ret; } @@ -1505,75 +1508,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages, { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; - enum zone_type zt, zone_last = ZONE_NORMAL; + enum zone_type zt; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. 
- * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; /* - * check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is in a zone of 0...zone_last, - * and it is the last present memory, 0...zone_last will - * become empty after offline , thus we can determind we will - * need to clear the node from node_states[N_NORMAL_MEMORY]. + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. */ - for (zt = 0; zt <= zone_last; zt++) + for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); - else - arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. + * node_states[N_HIGH_MEMORY] contains nodes which + * have normal memory or high memory. + * Here we add the present_pages belonging to ZONE_HIGHMEM. 
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and + * we determine that the zones in that range become empty, + * we need to clear the node for N_HIGH_MEMORY. */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; + if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* - * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + * We have accounted the pages from [0..ZONE_NORMAL), and + * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM + * as well. + * Here we count the possible pages from ZONE_MOVABLE. + * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. */ - zone_last = ZONE_MOVABLE; + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - /* - * check whether node_states[N_HIGH_MEMORY] will be changed - * If we try to offline the last present @nr_pages from the node, - * we can determind we will need to clear the node from - * node_states[N_HIGH_MEMORY]. 
- */ - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); - else - arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) @@ -1581,12 +1562,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid_high >= 0)) + if (arg->status_change_nid_high >= 0) node_clear_state(node, N_HIGH_MEMORY); - if ((N_MEMORY != N_HIGH_MEMORY) && - (arg->status_change_nid >= 0)) + if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } @@ -1600,16 +1579,18 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; + char *reason; + + mem_hotplug_begin(); - /* at least, alignment against pageblock is necessary */ - if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) - return -EINVAL; - if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) - return -EINVAL; /* This makes hotplug much easier...and readable. we assume this for now. 
.*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) - return -EINVAL; + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, + &valid_end)) { + ret = -EINVAL; + reason = "multizone range"; + goto failed_removal; + } zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); @@ -1617,9 +1598,12 @@ static int __ref __offline_pages(unsigned long start_pfn, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, - MIGRATE_MOVABLE, true); - if (ret) - return ret; + MIGRATE_MOVABLE, + SKIP_HWPOISON | REPORT_FAILURE); + if (ret) { + reason = "failure to isolate range"; + goto failed_removal; + } arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1627,37 +1611,46 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); - if (ret) - goto failed_removal; + if (ret) { + reason = "notifier failure"; + goto failed_removal_isolated; + } - pfn = start_pfn; -repeat: - /* start memory hot removal */ - ret = -EINTR; - if (signal_pending(current)) - goto failed_removal; + do { + for (pfn = start_pfn; pfn;) { + if (signal_pending(current)) { + ret = -EINTR; + reason = "signal backoff"; + goto failed_removal_isolated; + } - cond_resched(); - lru_add_drain_all(); - drain_all_pages(zone); + cond_resched(); + lru_add_drain_all(); - pfn = scan_movable_pages(start_pfn, end_pfn); - if (pfn) { /* We have movable pages */ - ret = do_migrate_range(pfn, end_pfn); - goto repeat; - } + pfn = scan_movable_pages(pfn, end_pfn); + if (pfn) { + /* + * TODO: fatal migration failures should bail + * out + */ + do_migrate_range(pfn, end_pfn); + } + } + + /* + * Dissolve free hugepages in the memory block before doing + * offlining actually in order to make hugetlbfs's object + * counting consistent. 
+ */ + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) { + reason = "failure to dissolve huge pages"; + goto failed_removal_isolated; + } + /* check again */ + offlined_pages = check_pages_isolated(start_pfn, end_pfn); + } while (offlined_pages < 0); - /* - * dissolve free hugepages in the memory block before doing offlining - * actually in order to make hugetlbfs's object counting consistent. - */ - ret = dissolve_free_huge_pages(start_pfn, end_pfn); - if (ret) - goto failed_removal; - /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - if (offlined_pages < 0) - goto repeat; pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ @@ -1690,19 +1683,22 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + mem_hotplug_done(); return 0; +failed_removal_isolated: + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); failed_removal: - pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", + pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, + reason); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + mem_hotplug_done(); return ret; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages); @@ -1791,34 +1787,6 @@ static int check_cpu_on_node(pg_data_t *pgdat) return 0; } -static void unmap_cpu_on_node(pg_data_t *pgdat) -{ -#ifdef CONFIG_ACPI_NUMA - int cpu; - - for_each_possible_cpu(cpu) - if (cpu_to_node(cpu) == pgdat->node_id) - numa_clear_node(cpu); -#endif -} - -static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) -{ - int ret; - 
- ret = check_cpu_on_node(pgdat); - if (ret) - return ret; - - /* - * the node will be offlined when we come here, so we can clear - * the cpu_to_node() now. - */ - - unmap_cpu_on_node(pgdat); - return 0; -} - /** * try_offline_node * @nid: the node ID @@ -1851,7 +1819,7 @@ void try_offline_node(int nid) return; } - if (check_and_unmap_cpu_on_node(pgdat)) + if (check_cpu_on_node(pgdat)) return; /* @@ -1873,7 +1841,7 @@ EXPORT_SYMBOL(try_offline_node); * and online/offline operations before this call, as required by * try_offline_node(). */ -void __ref remove_memory(int nid, u64 start, u64 size) +void __ref __remove_memory(int nid, u64 start, u64 size) { int ret; @@ -1896,11 +1864,18 @@ void __ref remove_memory(int nid, u64 start, u64 size) memblock_free(start, size); memblock_remove(start, size); - arch_remove_memory(start, size, NULL); + arch_remove_memory(nid, start, size, NULL); try_offline_node(nid); mem_hotplug_done(); } + +void remove_memory(int nid, u64 start, u64 size) +{ + lock_device_hotplug(); + __remove_memory(nid, start, size); + unlock_device_hotplug(); +} EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index da858f794eb6..af171ccb56a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(unsigned long addr) +static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); + int locked = 1; + err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err >= 0) { err = page_to_nid(p); 
put_page(p); } + if (locked) + up_read(&mm->mmap_sem); return err; } @@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(addr); + /* + * Take a refcount on the mpol, lookup_node() + * wil drop the mmap_sem, so after calling + * lookup_node() only "pol" remains valid, "vma" + * is stale. + */ + pol_refcount = pol; + vma = NULL; + mpol_get(pol); + err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; @@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, out: mpol_cond_put(pol); if (vma) - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); + if (pol_refcount) + mpol_put(pol_refcount); return err; } @@ -1300,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1477,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy, int uninitialized_var(pval); nodemask_t nodes; - if (nmask != NULL && maxnode < MAX_NUMNODES) + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; err = do_get_mempolicy(&pval, &nodes, addr, flags); @@ -1513,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); 
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) @@ -2039,8 +2053,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED && - !(pol->flags & MPOL_F_LOCAL)) + if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) hpage_node = pol->v.preferred_node; nmask = policy_nodemask(gfp, pol); @@ -2291,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); - int polnid = -1; + int polnid = NUMA_NO_NODE; int ret = -1; pol = get_vma_policy(vma, addr); @@ -2697,12 +2710,11 @@ static const char * const policy_modes[] = int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; - unsigned short mode; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int err = 1; + int err = 1, mode; if (nodelist) { /* NUL-terminate mode or flags string */ @@ -2717,12 +2729,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { - if (!strcmp(str, policy_modes[mode])) { - break; - } - } - if (mode >= MPOL_MAX) + mode = match_string(policy_modes, MPOL_MAX, str); + if (mode < 0) goto out; switch (mode) { diff --git a/mm/mempool.c b/mm/mempool.c index 0ef8cc8d1602..85efab3da720 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node); * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). + * + * Return: %0 on success, negative error code otherwise. 
*/ int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init); * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. + * + * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) @@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node); * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. + * + * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { @@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize); * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. + * + * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..ac6f4939bb59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* * Check PageMovable before holding a PG_lock because page's owner * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grapping the lock ruins page's owner side. + * so unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; @@ -326,17 +326,14 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count - * *must* be zero. 
And, we don't want to call wait_on_page_locked() - * against a page without get_page(). - * So, we use get_page_unless_zero(), here. Even failed, page fault - * will occur again. + * Once page cache replacement of page migration started, page_count + * is zero; but we must not call put_and_wait_on_page_locked() without + * a ref. Use get_page_unless_zero(), and just fault again if it fails. */ if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); return; out: pte_unmap_unlock(ptep, ptl); @@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); return; unlock: spin_unlock(ptl); } #endif -#ifdef CONFIG_BLOCK -/* Returns true if all buffers are successfully locked */ -static bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) +static int expected_page_refs(struct address_space *mapping, struct page *page) { - struct buffer_head *bh = head; - - /* Simple case, sync compaction */ - if (mode != MIGRATE_ASYNC) { - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + int expected_count = 1; - return true; - } - - /* async case, we cannot block on lock_buffer so use trylock_buffer */ - do { - get_bh(bh); - if (!trylock_buffer(bh)) { - /* - * We failed to lock the buffer and cannot stall in - * async migration. Release the taken locks - */ - struct buffer_head *failed_bh = bh; - put_bh(failed_bh); - bh = head; - while (bh != failed_bh) { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - } - return false; - } + /* + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. 
+ */ + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); + if (mapping) + expected_count += hpage_nr_pages(page) + page_has_private(page); - bh = bh->b_this_page; - } while (bh != head); - return true; -} -#else -static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) -{ - return true; + return expected_count; } -#endif /* CONFIG_BLOCK */ /* * Replace the page in the mapping. @@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode, + struct page *newpage, struct page *page, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; - int expected_count = 1 + extra_count; - void **pslot; - - /* - * Device public or private pages have an extra refcount as they are - * ZONE_DEVICE pages. 
- */ - expected_count += is_device_private_page(page); - expected_count += is_device_public_page(page); + int expected_count = expected_page_refs(mapping, page) + extra_count; if (!mapping) { /* Anonymous page without mapping */ @@ -470,35 +424,14 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); - - expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); - return -EAGAIN; - } - - /* - * In the async migration case of moving a page with buffers, lock the - * buffers using trylock before the mapping is moved. If the mapping - * was moved, we later failed to lock the buffers and could not move - * the mapping back due to an elevated page count, we would have to - * block waiting on other references to be dropped. 
- */ - if (mode == MIGRATE_ASYNC && head && - !buffer_migrate_lock_buffers(head, mode)) { - page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -526,16 +459,13 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@ -546,7 +476,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -586,22 +516,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -610,11 +536,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, 
newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } @@ -685,6 +611,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -758,7 +686,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -772,34 +700,94 @@ int migrate_page(struct address_space *mapping, EXPORT_SYMBOL(migrate_page); #ifdef CONFIG_BLOCK -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. 
Release the taken locks + */ + struct buffer_head *failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} + +static int __buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode, + bool check_refs) { struct buffer_head *bh, *head; int rc; + int expected_count; if (!page_has_buffers(page)) return migrate_page(mapping, newpage, page, mode); + /* Check whether page does not have extra refs before we do more work */ + expected_count = expected_page_refs(mapping, page); + if (page_count(page) != expected_count) + return -EAGAIN; + head = page_buffers(page); + if (!buffer_migrate_lock_buffers(head, mode)) + return -EAGAIN; - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); + if (check_refs) { + bool busy; + bool invalidated = false; - if (rc != MIGRATEPAGE_SUCCESS) - return rc; +recheck_buffers: + busy = false; + spin_lock(&mapping->private_lock); + bh = head; + do { + if (atomic_read(&bh->b_count)) { + busy = true; + break; + } + bh = bh->b_this_page; + } while (bh != head); + spin_unlock(&mapping->private_lock); + if (busy) { + if (invalidated) { + rc = -EAGAIN; + goto unlock_buffers; + } + invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } + } - /* - * In the async case, migrate_page_move_mapping locked the buffers - * with an IRQ-safe spinlock held. 
In the sync case, the buffers - * need to be locked now - */ - if (mode != MIGRATE_ASYNC) - BUG_ON(!buffer_migrate_lock_buffers(head, mode)); + rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) + goto unlock_buffers; ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -821,17 +809,41 @@ int buffer_migrate_page(struct address_space *mapping, else migrate_page_states(newpage, page); + rc = MIGRATEPAGE_SUCCESS; +unlock_buffers: bh = head; do { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } while (bh != head); - return MIGRATEPAGE_SUCCESS; + return rc; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. For example attached buffer heads are accessed only under page lock. + */ +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, false); } EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Same as above except that this variant is more careful and checks that there + * are also no buffer head references. This function is the right one for + * mappings where buffer heads are directly looked up and referenced (such as + * block device mappings). + */ +int buffer_migrate_page_norefs(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, true); +} #endif /* @@ -899,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping, */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; + return mode == MIGRATE_SYNC ? 
-EAGAIN : -EBUSY; return migrate_page(mapping, newpage, page, mode); } @@ -1118,10 +1130,13 @@ out: * If migration is successful, decrease refcount of the newpage * which will not free the page because new page owner increased * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. + * list in here. Use the old state of the isolated source page to + * determine if we migrated a LRU page. newpage was already unlocked + * and possibly modified by its owner - don't rely on the page + * state. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(__PageMovable(newpage))) + if (unlikely(!is_lru)) put_page(newpage); else putback_lru_page(newpage); @@ -1272,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; /* - * Movability of hugepages depends on architectures and hugepage size. + * Migratability of hugepages depends on architectures and their size. * This check is necessary because some callers of hugepage migration * like soft offline and memory hotremove don't walk through page * tables or check whether the hugepage is pmd-based or not before @@ -1300,6 +1315,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, lock_page(hpage); } + /* + * Check for pages which are in the process of being freed. Without + * page_mapping() set, hugetlbfs specific move page routine will not + * be called and we could leak usage counts for subpools. 
+ */ + if (page_private(hpage) && !page_mapping(hpage)) { + rc = -EBUSY; + goto out_unlock; + } + if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); @@ -1330,6 +1355,7 @@ put_anon: put_new_page = NULL; } +out_unlock: unlock_page(hpage); out: if (rc != -EAGAIN) @@ -1973,8 +1999,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -1997,15 +2022,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2029,16 +2054,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. 
Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. + */ + page_add_anon_rmap(new_page, vma, start, true); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2047,11 +2082,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. 
*/ get_page(new_page); @@ -2075,7 +2105,7 @@ out_fail: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); @@ -2309,6 +2339,7 @@ next: */ static void migrate_vma_collect(struct migrate_vma *migrate) { + struct mmu_notifier_range range; struct mm_walk mm_walk; mm_walk.pmd_entry = migrate_vma_collect_pmd; @@ -2320,13 +2351,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_invalidate_range_start(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + migrate->end); + mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } @@ -2605,7 +2634,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ @@ -2707,9 +2736,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr, i, mmu_start; + struct mmu_notifier_range range; + unsigned long addr, i; bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { @@ -2728,11 +2756,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate) continue; } if (!notified) { - mmu_start = addr; notified = true; - mmu_notifier_invalidate_range_start(mm, - mmu_start, - migrate->end); + + mmu_notifier_range_init(&range, + migrate->vma->vm_mm, + addr, migrate->end); + mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, &migrate->src[i], @@ -2773,8 +2802,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) * did already call it. */ if (notified) - mmu_notifier_invalidate_range_only_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(&range); } /* diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..218099b5ed31 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * shmem/tmpfs may return swap: account for swapcache * page too. 
*/ - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); @@ -233,14 +233,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, return -EINVAL; /* ..and we need to be passed a valid user-space range */ - if (!access_ok(VERIFY_READ, (void __user *) start, len)) + if (!access_ok((void __user *) start, len)) return -ENOMEM; /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; - if (!access_ok(VERIFY_WRITE, vec, pages)) + if (!access_ok(vec, pages)) return -EFAULT; tmp = (void *) __get_free_page(GFP_USER); diff --git a/mm/mlock.c b/mm/mlock.c index 41cc47e28ad6..080f3b36415b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { int nr_pages; - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); @@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page) * might otherwise copy PageMlocked to part of the tail pages before * we clear it in the head page. It also stabilizes hpage_nr_pages(). 
*/ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (!TestClearPageMlocked(page)) { /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ @@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page) } nr_pages = hpage_nr_pages(page); - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); __munlock_isolated_page(page); goto out; } __munlock_isolation_failed(page); unlock_out: - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); out: return nr_pages - 1; @@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pvec->pages[i] = NULL; } __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&zone->zone_pgdat->lru_lock); /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mm_init.c b/mm/mm_init.c index 6838a530789b..33917105a3a2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void) s32 batch = max_t(s32, nr*2, 32); /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } diff --git a/mm/mmap.c b/mm/mmap.c index f7cd9cb966c0..41eb48d9b527 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -191,16 +191,19 @@ static int 
do_brk_flags(unsigned long addr, unsigned long request, unsigned long SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; - unsigned long newbrk, oldbrk; + unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; unsigned long min_brk; bool populate; + bool downgraded = false; LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; + origbrk = mm->brk; + #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting @@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } - /* Always allow shrinking brk. */ + /* + * Always allow shrinking brk. + * __do_munmap() may downgrade mmap_sem to read. + */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) - goto set_brk; - goto out; + int ret; + + /* + * mm->brk must to be protected by write mmap_sem so update it + * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk will be restored from origbrk. + */ + mm->brk = brk; + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); + if (ret < 0) { + mm->brk = origbrk; + goto out; + } else if (ret == 1) { + downgraded = true; + } + goto success; } /* Check against existing mmap mappings. */ @@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. 
*/ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; - -set_brk: mm->brk = brk; + +success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - up_write(&mm->mmap_sem); + if (downgraded) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: - retval = mm->brk; + retval = origbrk; up_write(&mm->mmap_sem); return retval; } @@ -414,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma) { /* * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exacltly what we want. + * function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } @@ -988,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressue on the memory system forcing + * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ @@ -1091,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * It is important for case 8 that the the vma NNNN overlapping the + * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. 
This way in * all cases where vma_merge succeeds, the moment vma_adjust drops the @@ -1621,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* - * Some shared mappigns will want the pages marked read-only + * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). @@ -2042,6 +2066,15 @@ found_highest: return gap_end; } + +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -2061,8 +2094,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr); - if (len > TASK_SIZE - mmap_min_addr) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -2071,7 +2105,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2080,7 +2114,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = mmap_end; info.align_mask = 0; return vm_unmapped_area(&info); } @@ -2092,17 +2126,17 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, */ #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long 
addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr); /* requested length too big for entire address space */ - if (len > TASK_SIZE - mmap_min_addr) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -2112,7 +2146,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2121,7 +2155,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; addr = vm_unmapped_area(&info); @@ -2135,7 +2169,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } @@ -2391,12 +2425,11 @@ int expand_downwards(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; - int error; + int error = 0; address &= PAGE_MASK; - error = security_mmap_addr(address); - if (error) - return error; + if (address < mmap_min_addr) + return -EPERM; /* Enforce stack_guard_gap */ prev = vma->vm_prev; @@ -2687,8 +2720,8 @@ int 
split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge <[email protected]> */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2770,25 +2803,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, mm->locked_vm -= vma_pages(tmp); munlock_vma_pages_all(tmp); } + tmp = tmp->vm_next; } } - /* - * Remove the vma's, and unmap the actual pages - */ + /* Detach vmas from rbtree */ detach_vmas_to_be_unmapped(mm, vma, prev, end); - unmap_region(mm, vma, prev, start, end); + /* + * mpx unmap needs to be called with mmap_sem held for write. + * It is safe to call it before unmap_region(). + */ arch_unmap(mm, vma, start, end); + if (downgrade) + downgrade_write(&mm->mmap_sem); + + unmap_region(mm, vma, prev, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); - return 0; + return downgrade ? 1 : 0; } -int vm_munmap(unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) +{ + return __do_munmap(mm, start, len, uf, false); +} + +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; struct mm_struct *mm = current->mm; @@ -2797,17 +2843,32 @@ int vm_munmap(unsigned long start, size_t len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); - up_write(&mm->mmap_sem); + ret = __do_munmap(mm, start, len, &uf, downgrade); + /* + * Returning 1 indicates mmap_sem is downgraded. + * But 1 is not legal return value of vm_munmap() and munmap(), reset + * it to 0 before return. 
+ */ + if (ret == 1) { + up_read(&mm->mmap_sem); + ret = 0; + } else + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } + +int vm_munmap(unsigned long start, size_t len) +{ + return __vm_munmap(start, len, false); +} EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(addr, len); + return __vm_munmap(addr, len, true); } @@ -2910,16 +2971,6 @@ out: return ret; } -static inline void verify_mm_writelocked(struct mm_struct *mm) -{ -#ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { - WARN_ON(1); - up_read(&mm->mmap_sem); - } -#endif -} - /* * this is really a simplified "do_mmap". it only handles * anonymous maps. eventually we may be able to do some @@ -2947,12 +2998,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla return error; /* - * mm->mmap_sem is required to protect against another thread - * changing the mappings in case we sleep. - */ - verify_mm_writelocked(mm); - - /* * Clear old maps. this also does some error checking for us */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 2a9fbc4a37d5..f2f03c655807 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -199,7 +199,7 @@ void tlb_table_flush(struct mmu_gather *tlb) if (*batch) { tlb_table_invalidate(tlb); - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 82bb1a939c0e..9c884abc7850 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu, } EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); -void mmu_notifier_synchronize(void) -{ - /* Wait for any running method to finish. 
*/ - srcu_barrier(&srcu); -} -EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -174,22 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool blockable) +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { struct mmu_notifier *mn; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) { - int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); + int _ret = mn->ops->invalidate_range_start(mn, range); if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", - mn->ops->invalidate_range_start, _ret, - !blockable ? "non-" : ""); + mn->ops->invalidate_range_start, _ret, + !range->blockable ? "non-" : ""); ret = _ret; } } @@ -200,16 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); -void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, - unsigned long end, +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, bool only_end) { struct mmu_notifier *mn; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -224,9 +213,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * already happen under page table lock. 
*/ if (!only_end && mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, mm, start, end); + mn->ops->invalidate_range(mn, range->mm, + range->start, + range->end); if (mn->ops->invalidate_range_end) - mn->ops->invalidate_range_end(mn, mm, start, end); + mn->ops->invalidate_range_end(mn, range); } srcu_read_unlock(&srcu, id); } @@ -247,37 +238,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -/* - * Must be called while holding mm->mmap_sem for either read or write. - * The result is guaranteed to be valid until mm->mmap_sem is dropped. - */ -bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - struct mmu_notifier *mn; - int id; - bool ret = false; - - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); - - if (!mm_has_notifiers(mm)) - return ret; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (!mn->ops->invalidate_range && - !mn->ops->invalidate_range_start && - !mn->ops->invalidate_range_end) - continue; - - if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { - ret = true; - break; - } - } - srcu_read_unlock(&srcu, id); - return ret; -} - static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mprotect.c b/mm/mprotect.c index 6d331620b9e5..028c724dcb1a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(mm, addr, pte, ptent); + 
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; - struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; - unsigned long mni_start = 0; + struct mmu_notifier_range range; + + range.start = 0; pmd = pmd_offset(pud, addr); do { @@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, goto next; /* invoke the mmu notifier if the pmd is populated */ - if (!mni_start) { - mni_start = addr; - mmu_notifier_invalidate_range_start(mm, mni_start, end); + if (!range.start) { + mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); } if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { @@ -214,8 +215,8 @@ next: cond_resched(); } while (pmd++, addr = next, addr != end); - if (mni_start) - mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (range.start) + mmu_notifier_invalidate_range_end(&range); if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); diff --git a/mm/mremap.c b/mm/mremap.c index a9617e72e6b7..e3edef6b7a12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,22 +191,66 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifdef CONFIG_HAVE_MOVE_PMD +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pmd_t pmd; + + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) + || old_end - old_addr < PMD_SIZE) + return false; + + /* + * The destination pmd shouldn't be established, 
free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + old_ptl = pmd_lock(vma->vm_mm, old_pmd); + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pmd */ + pmd = *old_pmd; + pmd_clear(old_pmd); + + VM_BUG_ON(!pmd_none(*new_pmd)); + + /* Set the new pmd */ + set_pmd_at(mm, new_addr, new_pmd, pmd); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#endif + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks) { unsigned long extent, next, old_end; + struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmun_start = old_addr; - mmun_end = old_end; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -237,8 +281,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma, split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; + } else if (extent == PMD_SIZE) { +#ifdef CONFIG_HAVE_MOVE_PMD + /* + * If the extent is PMD-sized, try to speed the move by + * moving at the PMD level if possible. 
+ */ + bool moved; + + if (need_rmap_locks) + take_rmap_locks(vma); + moved = move_normal_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); + if (need_rmap_locks) + drop_rmap_locks(vma); + if (moved) + continue; +#endif } - if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) + + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) @@ -247,7 +309,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd, new_addr, need_rmap_locks); } - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return len + old_addr - old_end; /* how much done */ } @@ -454,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmaping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. 
+ */ + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -521,6 +600,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -557,12 +637,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. */ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -627,7 +715,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(¤t->mm->mmap_sem); + if (downgraded) + up_read(¤t->mm->mmap_sem); + else + up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); diff --git a/mm/nobootmem.c b/mm/nobootmem.c deleted file mode 100644 index 439af3b765a7..000000000000 --- a/mm/nobootmem.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). 
- */ -#include <linux/init.h> -#include <linux/pfn.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/kmemleak.h> -#include <linux/range.h> -#include <linux/memblock.h> -#include <linux/bootmem.h> - -#include <asm/bug.h> -#include <asm/io.h> - -#include "internal.h" - -#ifndef CONFIG_HAVE_MEMBLOCK -#error CONFIG_HAVE_MEMBLOCK not defined -#endif - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - enum memblock_flags flags = choose_memblock_flags(); - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - -again: - addr = memblock_find_in_range_node(size, align, goal, limit, nid, - flags); - if (!addr && (flags & MEMBLOCK_MIRROR)) { - flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", - &size); - goto again; - } - if (!addr) - return NULL; - - if (memblock_reserve(addr, size)) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} - -/** - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. 
- */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(addr, size); - - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int order; - - while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); - - while (start + (1UL << order) > end) - order--; - - __free_pages_bootmem(pfn_to_page(start), start, order); - - start += (1UL << order); - } -} - -static unsigned long __init __free_memory_core(phys_addr_t start, - phys_addr_t end) -{ - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - - if (start_pfn >= end_pfn) - return 0; - - __free_pages_memory(start_pfn, end_pfn); - - return end_pfn - start_pfn; -} - -static unsigned long __init free_low_memory_core_early(void) -{ - unsigned long count = 0; - phys_addr_t start, end; - u64 i; - - memblock_clear_hotplug(0, -1); - - for_each_reserved_mem_region(i, &start, &end) - reserve_bootmem_region(start, end); - - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) - count += __free_memory_core(start, end); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; 
-} - -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Return: the number of pages actually released. - */ -unsigned long __init free_all_bootmem(void) -{ - unsigned long pages; - - reset_all_zones_managed_pages(); - - pages = free_low_memory_core_early(); - totalram_pages += pages; - - return pages; -} - -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. - */ -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - memblock_free(physaddr, size); -} - -/** - * free_bootmem - mark a page range as usable - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. - */ -void __init free_bootmem(unsigned long addr, unsigned long size) -{ - memblock_free(addr, size); -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -} - -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. 
- * - * Return: address of the allocated region or %NULL on failure. - */ -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. - */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. 
- */ -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -again: - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, limit); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, - goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. 
- */ -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return __alloc_bootmem_node(pgdat, size, align, goal); -} - - -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. 
- */ -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} diff --git a/mm/nommu.c b/mm/nommu.c index e4aac33216ae..749276beb109 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) { - *page_mask = 0; return NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6589f60d5018..3a2484884cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; } -enum oom_constraint { - CONSTRAINT_NONE, - CONSTRAINT_CPUSET, - CONSTRAINT_MEMORY_POLICY, - CONSTRAINT_MEMCG, +static const char * const oom_constraint_text[] = { + [CONSTRAINT_NONE] = "CONSTRAINT_NONE", + [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", + [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", + [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", }; /* @@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) } /* Default to all available memory */ - oc->totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages() + total_swap_pages; if (!IS_ENABLED(CONFIG_NUMA)) return CONSTRAINT_NONE; @@ -428,19 +428,29 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } +static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +{ + /* one line summary of the oom killer context. 
*/ + pr_info("oom-kill:constraint=%s,nodemask=%*pbl", + oom_constraint_text[oc->constraint], + nodemask_pr_args(oc->nodemask)); + cpuset_print_current_mems_allowed(); + mem_cgroup_print_oom_context(oc->memcg, victim); + pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, + from_kuid(&init_user_ns, task_uid(victim))); +} + static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, - nodemask_pr_args(oc->nodemask), oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); - cpuset_print_current_mems_allowed(); dump_stack(); if (is_memcg_oom(oc)) - mem_cgroup_print_oom_info(oc->memcg, p); + mem_cgroup_print_oom_meminfo(oc->memcg); else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); if (is_dump_unreclaim_slabs()) @@ -448,6 +458,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); + if (p) + dump_oom_summary(oc, p); } /* @@ -516,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm) * count elevated without a good reason. 
*/ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - const unsigned long start = vma->vm_start; - const unsigned long end = vma->vm_end; + struct mmu_notifier_range range; struct mmu_gather tlb; - tlb_gather_mmu(&tlb, mm, start, end); - if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_range_init(&range, mm, vma->vm_start, + vma->vm_end); + tlb_gather_mmu(&tlb, mm, range.start, range.end); + if (mmu_notifier_invalidate_range_start_nonblock(&range)) { + tlb_finish_mmu(&tlb, range.start, range.end); ret = false; continue; } - unmap_page_range(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + unmap_page_range(&tlb, vma, range.start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, range.start, range.end); } } @@ -634,8 +647,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* mm is already queued? 
*/ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); @@ -830,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void __oom_kill_process(struct task_struct *victim) +static void __oom_kill_process(struct task_struct *victim, const char *message) { struct task_struct *p; struct mm_struct *mm; @@ -861,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", - task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + message, task_pid_nr(victim), victim->comm, + K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); @@ -913,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim) * Kill provided task unless it's secured by setting * oom_score_adj to OOM_SCORE_ADJ_MIN. 
*/ -static int oom_kill_memcg_member(struct task_struct *task, void *unused) +static int oom_kill_memcg_member(struct task_struct *task, void *message) { - if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && + !is_global_init(task)) { get_task_struct(task); - __oom_kill_process(task); + __oom_kill_process(task, message); } return 0; } static void oom_kill_process(struct oom_control *oc, const char *message) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *victim = oc->chosen; struct mem_cgroup *oom_group; - unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -939,49 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * its children or threads, just give it access to memory reserves * so it can die quickly */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + task_lock(victim); + if (task_will_free_mem(victim)) { + mark_oom_victim(victim); + wake_oom_reaper(victim); + task_unlock(victim); + put_task_struct(victim); return; } - task_unlock(p); + task_unlock(victim); if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. 
- */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); + dump_header(oc, victim); /* * Do we need to kill the entire memory cgroup? @@ -990,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); - __oom_kill_process(victim); + __oom_kill_process(victim, message); /* * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { mem_cgroup_print_oom_group(oom_group); - mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, + (void*)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..9f61dfec6a1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * - * Returns the node's number of pages potentially available for dirty + * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. 
*/ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) @@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) /** * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the global number of pages potentially available for dirty + * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) @@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a node, based + * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) @@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * - * Returns %true when the dirty pages in @pgdat are within the node's + * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) @@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * - * Returns @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. 
For example, when the device is completely stalled due to some error @@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited); * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's - * clean enough. Returns %true if writeback should continue. + * clean enough. + * + * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { @@ -2097,34 +2099,25 @@ void __init page_writeback_init(void) * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. 
- */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2149,6 +2142,15 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. 
+ * + * Return: %0 on success, negative error code otherwise */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2156,37 +2158,31 @@ int write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0; + int error; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2236,25 +2232,31 @@ continue_unlock: goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - ret = (*writepage)(page, wbc, data); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { + error = (*writepage)(page, wbc, data); + if (unlikely(error)) { + /* + * Handle errors according to the type of + * writeback. There's no need to continue for + * background writeback. Just push done_index + * past this page so media errors won't choke + * writeout for the entire file. For integrity + * writeback, we must process the entire dirty + * set regardless of errors because the fs may + * still have state to clear for each page. In + * that case we continue processing and return + * the first error. 
+ */ + if (error == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); - ret = 0; - } else { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ + error = 0; + } else if (wbc->sync_mode != WB_SYNC_ALL) { + ret = error; done_index = page->index + 1; done = 1; break; } + if (!ret) + ret = error; } /* @@ -2272,17 +2274,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; @@ -2310,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc, * * This is a library function, which implements the writepages() * address_space_operation. + * + * Return: %0 on success, negative error code otherwise */ int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2356,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this * function returns. + * + * Return: %0 on success, negative error code otherwise */ int write_one_page(struct page *page) { @@ -2445,7 +2448,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. 
* * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@ -2471,7 +2474,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@ -2634,13 +2637,13 @@ EXPORT_SYMBOL(__cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. 
*/ int clear_page_dirty_for_io(struct page *page) @@ -2721,7 +2724,7 @@ int test_clear_page_writeback(struct page *page) xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2761,11 +2764,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2773,8 +2778,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2787,12 +2791,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@ -2806,16 +2808,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); -/* - * 
Return true if any of the pages in the mapping are marked with the - * passed tag. - */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - return radix_tree_tagged(&mapping->i_pages, tag); -} -EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ef1c17942f..3eb01dedfb50 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,11 +16,11 @@ #include <linux/stddef.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/jiffies.h> -#include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> @@ -66,6 +66,7 @@ #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/nmi.h> +#include <linux/psi.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif /* work_structs for global per-cpu drains */ +struct pcpu_drain { + struct zone *zone; + struct work_struct work; +}; DEFINE_MUTEX(pcpu_drain_mutex); -DEFINE_PER_CPU(struct work_struct, pcpu_drain); +DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -/* Protect totalram_pages and zone->managed_pages */ -static DEFINE_SPINLOCK(managed_page_count_lock); - -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; @@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; -char * const migratetype_names[MIGRATE_TYPES] = { +const char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", 
"Movable", "Reclaimable", @@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_boost_factor __read_mostly = 15000; int watermark_scale_factor = 10; -static unsigned long nr_kernel_pages __meminitdata; -static unsigned long nr_all_pages __meminitdata; -static unsigned long dma_reserve __meminitdata; +static unsigned long nr_kernel_pages __initdata; +static unsigned long nr_all_pages __initdata; +static unsigned long dma_reserve __initdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; -static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; -static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ @@ -285,8 +289,8 @@ EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly = MAX_NUMNODES; -int nr_online_nodes __read_mostly = 1; +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; +unsigned int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif @@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * 
page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * Calling kasan_free_pages() only after deferred memory initialization + * has completed. Poisoning pages during deferred memory init will greatly + * lengthen the process and cause problem in large memory systems as the + * deferred pages initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline void kasan_free_nondeferred_pages(struct page *page, int order) +{ + if (!static_branch_unlikely(&deferred_pages)) + kasan_free_pages(page, order); +} + /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -306,36 +336,50 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) } /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. */ -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { + static unsigned long prev_end_pfn, nr_initialised; + + /* + * prev_end_pfn static that contains the end of previous zone + * No need to protect because called very early in boot before smp_init. 
+ */ + if (prev_end_pfn != end_pfn) { + prev_end_pfn = end_pfn; + nr_initialised = 0; + } + /* Always populate low zones for address-constrained allocations */ - if (zone_end < pgdat_end_pfn(pgdat)) - return true; - (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_pgcnt) && - (pfn & (PAGES_PER_SECTION - 1)) == 0) { - pgdat->first_deferred_pfn = pfn; + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; - } - return true; + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + nr_initialised++; + if ((nr_initialised > PAGES_PER_SECTION) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + NODE_DATA(nid)->first_deferred_pfn = pfn; + return true; + } + return false; } #else +#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) + static inline bool early_page_uninitialised(unsigned long pfn) { return false; } -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - return true; + return false; } #endif @@ -419,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); bitmap = get_pageblock_bitmap(page, pfn); bitidx = pfn_to_bitidx(page, pfn); @@ -744,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; } +#ifdef CONFIG_COMPACTION +static inline struct capture_control *task_capc(struct zone *zone) +{ + struct capture_control *capc = current->capture_control; + + return capc && + !(current->flags & PF_KTHREAD) && + !capc->page && + capc->cc->zone == zone && + capc->cc->direct_compaction ? 
capc : NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + if (!capc || order != capc->cc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions */ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + /* + * Do not let lower order allocations pollute a movable pageblock. + * This might let an unmovable request use a reclaimable pageblock + * and vice-versa but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + return false; + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + /* * Freeing function for a buddy system allocator. 
* @@ -777,6 +873,7 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; + struct capture_control *capc = task_capc(zone); max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -792,6 +889,11 @@ static inline void __free_one_page(struct page *page, continue_merging: while (order < max_order - 1) { + if (compaction_capture(capc, page, order, migratetype)) { + __mod_zone_freepage_state(zone, -(1 << order), + migratetype); + return; + } buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -1011,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -1030,7 +1132,7 @@ static __always_inline bool free_pages_prepare(struct page *page, arch_free_page(page, order); kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); - kasan_free_pages(page, order); + kasan_free_nondeferred_pages(page, order); return true; } @@ -1176,6 +1278,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); + page_kasan_tag_reset(page); INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL @@ -1231,7 +1334,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. 
+ */ + __SetPageReserved(page); } } } @@ -1252,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1267,7 +1375,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) __ClearPageReserved(p); set_page_count(p, 0); - page_zone(page)->managed_pages += nr_pages; + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); set_page_refcounted(page); __free_pages(page, order); } @@ -1326,12 +1434,12 @@ meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + __free_pages_core(page, order); } /* @@ -1421,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); + __free_pages_core(page, 0); } } @@ -1594,13 +1702,6 @@ static int __init deferred_init_memmap(void *data) } /* - * During boot we initialize deferred pages on-demand, as needed, but once - * page_alloc_init_late() has finished, the deferred pages are all initialized, - * and we can permanently disable that path. 
- */ -static DEFINE_STATIC_KEY_TRUE(deferred_pages); - -/* * If this zone has deferred pages, try to grow it by initializing enough * deferred pages to satisfy the allocation specified by order, rounded up to * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments @@ -1901,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); + kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); } @@ -1969,8 +2070,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, */ static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif @@ -2015,10 +2116,6 @@ static int move_freepages(struct zone *zone, pfn_valid(page_to_pfn(end_page)) && page_zone(start_page) != page_zone(end_page)); #endif - - if (num_movable) - *num_movable = 0; - for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -2058,6 +2155,9 @@ int move_freepages_block(struct zone *zone, struct page *page, unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; + if (num_movable) + *num_movable = 0; + start_pfn = page_to_pfn(page); start_pfn = start_pfn & ~(pageblock_nr_pages-1); start_page = pfn_to_page(start_pfn); @@ -2118,6 +2218,33 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } +static inline void boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return; + + 
max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return; + + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); +} + /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this @@ -2127,7 +2254,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type, bool whole_block) + unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; @@ -2149,6 +2276,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; } + /* + * Boost watermarks to increase reclaim pressure to reduce the + * likelihood of future fallbacks. Wake kswapd now as the node + * may be balanced overall and kswapd will not wake naturally. + */ + boost_watermark(zone); + if (alloc_flags & ALLOC_KSWAPD) + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + /* We are not allowed to try stealing from the whole block */ if (!whole_block) goto single_page; @@ -2247,7 +2383,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, * Limit the number reserved to 1 pageblock or roughly 1% of a zone. * Check is race-prone but harmless. 
*/ - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) return; @@ -2364,20 +2500,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * condition simpler. */ static __always_inline bool -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) { struct free_area *area; int current_order; + int min_order = order; struct page *page; int fallback_mt; bool can_steal; /* + * Do not steal pages from freelists belonging to other pageblocks + * i.e. orders < pageblock_order. If there are no local zones free, + * the zonelists will be reiterated without ALLOC_NOFRAGMENT. + */ + if (alloc_flags & ALLOC_NOFRAGMENT) + min_order = pageblock_order; + + /* * Find the largest available free page in the other list. This roughly * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. */ - for (current_order = MAX_ORDER - 1; current_order >= order; + for (current_order = MAX_ORDER - 1; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2422,7 +2568,8 @@ do_steal: page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - steal_suitable_fallback(zone, page, start_migratetype, can_steal); + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -2436,7 +2583,8 @@ do_steal: * Call me with the zone->lock already held. 
*/ static __always_inline struct page * -__rmqueue(struct zone *zone, unsigned int order, int migratetype) +__rmqueue(struct zone *zone, unsigned int order, int migratetype, + unsigned int alloc_flags) { struct page *page; @@ -2446,7 +2594,8 @@ retry: if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype)) + if (!page && __rmqueue_fallback(zone, order, migratetype, + alloc_flags)) goto retry; } @@ -2461,13 +2610,14 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, unsigned int alloc_flags) { int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page = __rmqueue(zone, order, migratetype, + alloc_flags); if (unlikely(page == NULL)) break; @@ -2581,6 +2731,10 @@ void drain_local_pages(struct zone *zone) static void drain_local_pages_wq(struct work_struct *work) { + struct pcpu_drain *drain; + + drain = container_of(work, struct pcpu_drain, work); + /* * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from @@ -2589,7 +2743,7 @@ static void drain_local_pages_wq(struct work_struct *work) * a different one. 
*/ preempt_disable(); - drain_local_pages(NULL); + drain_local_pages(drain->zone); preempt_enable(); } @@ -2660,12 +2814,14 @@ void drain_all_pages(struct zone *zone) } for_each_cpu(cpu, &cpus_with_pcps) { - struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); - INIT_WORK(work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, work); + struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); + + drain->zone = zone; + INIT_WORK(&drain->work, drain_local_pages_wq); + queue_work_on(cpu, mm_percpu_wq, &drain->work); } for_each_cpu(cpu, &cpus_with_pcps) - flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -2863,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * watermark, because we already know our high-order page * exists. */ - watermark = min_wmark_pages(zone) + (1UL << order); + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; @@ -2923,6 +3079,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) { @@ -2932,7 +3089,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype); + migratetype, alloc_flags); if (unlikely(list_empty(list))) return NULL; } @@ -2948,7 +3105,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct 
per_cpu_pages *pcp; struct list_head *list; @@ -2958,7 +3116,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -2981,7 +3139,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (likely(order == 0)) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype); + gfp_flags, migratetype, alloc_flags); goto out; } @@ -3000,7 +3158,7 @@ struct page *rmqueue(struct zone *preferred_zone, trace_mm_page_alloc_zone_locked(page, order, migratetype); } if (!page) - page = __rmqueue(zone, order, migratetype); + page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); spin_unlock(&zone->lock); if (!page) @@ -3013,6 +3171,12 @@ struct page *rmqueue(struct zone *preferred_zone, local_irq_restore(flags); out: + /* Separate test+clear to avoid unnecessary atomics */ + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + } + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; @@ -3042,7 +3206,7 @@ static int __init setup_fail_page_alloc(char *str) } __setup("fail_page_alloc=", setup_fail_page_alloc); -static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { if (order < fail_page_alloc.min_order) return false; @@ -3066,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void) dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - if (!debugfs_create_bool("ignore-gfp-wait", mode, 
dir, - &fail_page_alloc.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem)) - goto fail; - if (!debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order)) - goto fail; - return 0; -fail: - debugfs_remove_recursive(dir); + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - return -ENOMEM; + return 0; } late_initcall(fail_page_alloc_debugfs); @@ -3092,13 +3246,19 @@ late_initcall(fail_page_alloc_debugfs); #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ +static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return __should_fail_alloc_page(gfp_mask, order); +} +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); + /* * Return true if free base pages are above 'mark'. For high-order checks it * will return true of the order-0 watermark is reached and there is at least @@ -3243,6 +3403,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ /* + * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid + * fragmentation is subtle. If the preferred zone was HIGHMEM then + * premature use of a lower zone may cause lowmem pressure problems that + * are worse than fragmentation. If the next zone is ZONE_DMA then it is + * probably too small. It only makes sense to spread allocations to avoid + * fragmentation between the Normal and DMA32 zones. 
+ */ +static inline unsigned int +alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) +{ + unsigned int alloc_flags = 0; + + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + alloc_flags |= ALLOC_KSWAPD; + +#ifdef CONFIG_ZONE_DMA32 + if (zone_idx(zone) != ZONE_NORMAL) + goto out; + + /* + * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and + * the pointer is within zone->zone_pgdat->node_zones[]. Also assume + * on UMA that if Normal is populated then so is DMA32. + */ + BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); + if (nr_online_nodes > 1 && !populated_zone(--zone)) + goto out; + +out: +#endif /* CONFIG_ZONE_DMA32 */ + return alloc_flags; +} + +/* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ @@ -3250,14 +3444,18 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z = ac->preferred_zoneref; + struct zoneref *z; struct zone *zone; struct pglist_data *last_pgdat_dirty_limit = NULL; + bool no_fallback; +retry: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; + z = ac->preferred_zoneref; for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; @@ -3296,7 +3494,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (no_fallback && nr_online_nodes > 1 && + zone != ac->preferred_zoneref->zone) { + int local_nid; + + /* + * If moving to a remote node, retry but allow + * fragmenting fallbacks. Locality is more important + * than fragmentation avoidance. 
+ */ + local_nid = zone_to_nid(ac->preferred_zoneref->zone); + if (zone_to_nid(zone) != local_nid) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, ac_classzone_idx(ac), alloc_flags)) { int ret; @@ -3363,21 +3577,16 @@ try_this_zone: } } - return NULL; -} - -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. - */ -static inline bool should_suppress_show_mem(void) -{ - bool ret = false; + /* + * It's possible on a UMA machine to get through all zones that are + * fragmented. If avoiding fragmentation, reset and try again. + */ + if (no_fallback) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } -#if NODES_SHIFT > 8 - ret = in_interrupt(); -#endif - return ret; + return NULL; } static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) @@ -3385,7 +3594,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) + if (!__ratelimit(&show_mem_rs)) return; /* @@ -3416,13 +3625,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", current->comm, &vaf, gfp_mask, &gfp_mask, nodemask_pr_args(nodemask)); va_end(args); cpuset_print_current_mems_allowed(); - + pr_cont("\n"); dump_stack(); warn_alloc_show_mem(gfp_mask, nodemask); } @@ -3548,19 +3757,26 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { - struct page *page; + struct page *page = NULL; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - prio); + prio, &page); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) + if (*compact_result <= COMPACT_INACTIVE) { + WARN_ON_ONCE(page); return NULL; + } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -3568,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + /* Prep a captured page if available */ + if (page) + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* Try get a page from the freelist if available */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3756,11 +3978,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = 
memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3996,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags); cond_resched(); @@ -3856,6 +4081,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + alloc_flags |= ALLOC_KSWAPD; + #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3922,6 +4150,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, { struct zone *zone; struct zoneref *z; + bool ret = false; /* * Costly allocations might have made a progress but this doesn't mean @@ -3985,25 +4214,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, } } - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that - * a particular WQ is congested if the worker thread is - * looping without ever sleeping. Therefore we have to - * do a short sleep here rather than calling - * cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - - return true; + ret = true; + goto out; } } - return false; +out: + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). 
+ */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return ret; } static inline bool @@ -4056,17 +4284,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; /* - * In the slowpath, we sanity check order to avoid ever trying to - * reclaim >= MAX_ORDER areas which will never succeed. Callers may - * be using allocators in order of preference for an area that is - * too large. - */ - if (order >= MAX_ORDER) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); - return NULL; - } - - /* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. */ @@ -4098,7 +4315,7 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); /* @@ -4156,7 +4373,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); @@ -4359,6 +4576,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; + /* + * There are several places where we assume that the order value is sane + * so bail out early if the request is out of bound. 
+ */ + if (unlikely(order >= MAX_ORDER)) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + return NULL; + } + gfp_mask &= gfp_allowed_mask; alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) @@ -4366,6 +4592,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, finalise_ac(gfp_mask, &ac); + /* + * Forbid the first pass from falling back to types that fragment + * memory until all local zones are considered. + */ + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) @@ -4391,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } @@ -4424,16 +4656,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __free_pages(struct page *page, unsigned int order) +static inline void free_the_page(struct page *page, unsigned int order) { - if (put_page_testzero(page)) { - if (order == 0) - free_unref_page(page); - else - __free_pages_ok(page, order); - } + if (order == 0) /* Via pcp? 
*/ + free_unref_page(page); + else + __free_pages_ok(page, order); } +void __free_pages(struct page *page, unsigned int order) +{ + if (put_page_testzero(page)) + free_the_page(page, order); +} EXPORT_SYMBOL(__free_pages); void free_pages(unsigned long addr, unsigned int order) @@ -4482,14 +4717,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); - if (page_ref_sub_and_test(page, count)) { - unsigned int order = compound_order(page); - - if (order == 0) - free_unref_page(page); - else - __free_pages_ok(page, order); - } + if (page_ref_sub_and_test(page, count)) + free_the_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -4513,11 +4742,11 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - page_ref_add(page, size - 1); + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = size; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = size; } @@ -4533,10 +4762,10 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - set_page_count(page, size); + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; } @@ -4555,7 +4784,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - __free_pages_ok(page, compound_order(page)); + free_the_page(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); @@ -4587,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * This function is also limited by MAX_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). 
+ * + * Return: pointer to the allocated area or %NULL in case of error. */ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) { @@ -4607,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. + * + * Return: pointer to the allocated area or %NULL in case of error. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -4640,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * - * nr_free_zone_pages() counts the number of counts pages which are beyond the + * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * * nr_free_zone_pages = managed_pages - high_pages + * + * Return: number of pages beyond high watermark. */ static unsigned long nr_free_zone_pages(int offset) { @@ -4657,7 +4892,7 @@ static unsigned long nr_free_zone_pages(int offset) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->managed_pages; + unsigned long size = zone_managed_pages(zone); unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -4671,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset) * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. + * + * Return: number of pages beyond high watermark within ZONE_DMA and + * ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { @@ -4683,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); * * nr_free_pagecache_pages() counts the number of pages which are beyond the * high watermark within all zones. + * + * Return: number of pages beyond high watermark within all zones. 
*/ unsigned long nr_free_pagecache_pages(void) { @@ -4701,6 +4941,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4708,7 +4949,7 @@ long si_mem_available(void) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) - wmark_low += zone->watermark[WMARK_LOW]; + wmark_low += low_wmark_pages(zone); /* * Estimate the amount of memory available for userspace allocations, @@ -4726,19 +4967,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. - */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. 
*/ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; @@ -4748,11 +4983,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -4769,7 +5004,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += pgdat->node_zones[zone_type].managed_pages; + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); @@ -4778,7 +5013,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { - managed_highpages += zone->managed_pages; + managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } @@ -4985,7 +5220,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), - K(zone->managed_pages), + K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), @@ -5134,7 +5369,8 @@ static int node_load[MAX_NUMNODES]; * from each node to each node in the system), and should 
also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. - * It returns -1 if no node is found. + * + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ static int find_next_best_node(int node, nodemask_t *used_node_mask) { @@ -5440,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -5449,76 +5685,151 @@ void __ref build_all_zonelists(pg_data_t *pgdat) #endif } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } +#endif + return false; +} + /* * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is + * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. 
*/ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context, struct vmem_altmap *altmap) { - unsigned long end_pfn = start_pfn + size; - pg_data_t *pgdat = NODE_DATA(nid); - unsigned long pfn; - unsigned long nr_initialised = 0; + unsigned long pfn, end_pfn = start_pfn + size; struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - struct memblock_region *r = NULL, *tmp; -#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE - * memory + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) - goto not_early; - - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) - break; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Check given memblock attribute by firmware which can affect - * kernel memory layout. If zone==ZONE_MOVABLE but memory is - * mirrored, it's an overlapped memmap init. skip it. 
- */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) continue; - } + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, end_pfn)) + break; } -#endif -not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. 
+ */ + if (!(pfn & (pageblock_nr_pages - 1))) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + } +} + +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long size, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + return; + + /* + * The call to memmap_init_zone should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (pgmap->altmap_valid) { + struct vmem_altmap *altmap = &pgmap->altmap; + + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + size = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. 
+ */ + page->pgmap = pgmap; + page->hmm_data = 0; /* * Mark the block movable so that blocks are reserved for @@ -5540,8 +5851,12 @@ not_early: cond_resched(); } } + + pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + size, jiffies_to_msecs(jiffies - start)); } +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; @@ -5551,10 +5866,11 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) -#endif +void __meminit __weak memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); +} static int zone_batchsize(struct zone *zone) { @@ -5565,7 +5881,7 @@ static int zone_batchsize(struct zone *zone) * The per-cpu-pages pools are set to around 1000th of the * size of the zone. */ - batch = zone->managed_pages / 1024; + batch = zone_managed_pages(zone) / 1024; /* But no more than a meg. 
*/ if (batch * PAGE_SIZE > 1024 * 1024) batch = (1024 * 1024) / PAGE_SIZE; @@ -5646,7 +5962,6 @@ static void pageset_init(struct per_cpu_pageset *p) memset(p, 0, sizeof(*p)); pcp = &p->pcp; - pcp->count = 0; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -5676,7 +5991,7 @@ static void pageset_set_high_and_batch(struct zone *zone, { if (percpu_pagelist_fraction) pageset_set_high(pcp, - (zone->managed_pages / + (zone_managed_pages(zone) / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); @@ -5735,8 +6050,10 @@ void __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; + int zone_idx = zone_idx(zone) + 1; - pgdat->nr_zones = zone_idx(zone) + 1; + if (zone_idx > pgdat->nr_zones) + pgdat->nr_zones = zone_idx; zone->zone_start_pfn = zone_start_pfn; @@ -5766,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { state->last_start = start_pfn; state->last_end = end_pfn; state->last_nid = nid; @@ -5828,7 +6145,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ -void __meminit get_pfn_range_for_nid(unsigned int nid, +void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long this_start_pfn, this_end_pfn; @@ -5877,7 +6194,7 @@ static void __init find_usable_zone_for_movable(void) * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that * zones within a node are in order of monotonic increases memory addresses */ -static void __meminit adjust_zone_range_for_zone_movable(int nid, +static void __init adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -5908,7 +6225,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ -static unsigned long __meminit zone_spanned_pages_in_node(int nid, +static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -5943,7 +6260,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -5964,7 +6281,7 @@ unsigned long __meminit __absent_pages_in_range(int nid, * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * - * It returns the number of pages frames in memory holes within a range. + * Return: the number of pages frames in memory holes within a range. 
*/ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) @@ -5973,7 +6290,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, } /* Return the number of page frames in holes in a zone on a node */ -static unsigned long __meminit zone_absent_pages_in_node(int nid, +static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6025,7 +6342,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, +static inline unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6044,7 +6361,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, return zones_size[zone_type]; } -static inline unsigned long __meminit zone_absent_pages_in_node(int nid, +static inline unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6058,7 +6375,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, +static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, @@ -6126,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; - if (usemapsize) + if (usemapsize) { zone->pageblock_flags = - memblock_virt_alloc_node_nopanic(usemapsize, + memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); + if (!zone->pageblock_flags) + panic("Failed to allocate %ld bytes for zone %s pageblock 
flags on node %d\n", + usemapsize, zone->name, pgdat->node_id); + } } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -6231,7 +6552,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, unsigned long remaining_pages) { - zone->managed_pages = remaining_pages; + atomic_long_set(&zone->managed_pages, remaining_pages); zone_set_nid(zone, nid); zone->name = zone_names[idx]; zone->zone_pgdat = NODE_DATA(nid); @@ -6358,7 +6679,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node_nopanic(size, pgdat->node_id); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6384,12 +6708,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) { - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. 
- */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } #else @@ -6427,48 +6745,67 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Zero all valid struct pages in range [spfn, epfn), return number of struct + * pages zeroed + */ +static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} + /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=. */ void __init zero_resv_unavail(void) { phys_addr_t start, end; - unsigned long pfn; u64 i, pgcnt; + phys_addr_t next = 0; /* - * Loop through ranges that are reserved, but do not have reported - * physical memory backing. + * Loop through unavailable ranges not covered by memblock.memory. 
*/ pgcnt = 0; - for_each_resv_unavail_range(i, &start, &end) { - for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } + for_each_mem_range(i, &memblock.memory, NULL, + NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + if (next < start) + pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); + next = end; } + pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); /* * Struct pages that do not have backing memory. This could be because * firmware is using some of this memory, or for some other reasons. - * Once memblock is changed so such behaviour is not allowed: i.e. - * list of "reserved" memory must be a subset of list of "memory", then - * this code can be removed. */ if (pgcnt) - pr_info("Reserved but unavailable: %lld pages", pgcnt); + pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6501,14 +6838,14 @@ void __init setup_nr_node_ids(void) * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * - * Returns the determined alignment in pfn's. 0 if there is no alignment + * Return: the determined alignment in pfn's. 0 if there is no alignment * requirement (single node). 
*/ unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; unsigned long start, end, mask; - int last_nid = -1; + int last_nid = NUMA_NO_NODE; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { @@ -6556,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * - * It returns the minimum PFN based on information provided via + * Return: the minimum PFN based on information provided via * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) @@ -6803,15 +7140,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid) { enum zone_type zone_type; - if (N_MEMORY == N_NORMAL_MEMORY) - return; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { - node_set_state(nid, N_HIGH_MEMORY); - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && - zone_type <= ZONE_NORMAL) + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } @@ -6967,18 +7301,16 @@ early_param("movablecore", cmdline_parse_movablecore); void adjust_managed_page_count(struct page *page, long count) { - spin_lock(&managed_page_count_lock); - page_zone(page)->managed_pages += count; - totalram_pages += count; + atomic_long_add(count, &page_zone(page)->managed_pages); + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif - spin_unlock(&managed_page_count_lock); } EXPORT_SYMBOL(adjust_managed_page_count); -unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) { void *pos; unsigned long pages = 0; @@ -7009,15 +7341,14 @@ unsigned long free_reserved_area(void *start, void 
*end, int poison, char *s) return pages; } -EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; - page_zone(page)->managed_pages++; - totalhigh_pages++; + totalram_pages_inc(); + atomic_long_inc(&page_zone(page)->managed_pages); + totalhigh_pages_inc(); } #endif @@ -7066,10 +7397,10 @@ void __init mem_init_print_info(const char *str) physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? str : ""); } @@ -7149,6 +7480,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; + unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7159,8 +7491,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. 
*/ max += high_wmark_pages(zone); - if (max > zone->managed_pages) - max = zone->managed_pages; + if (max > managed_pages) + max = managed_pages; pgdat->totalreserve_pages += max; @@ -7184,7 +7516,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone->managed_pages; + unsigned long managed_pages = zone_managed_pages(zone); zone->lowmem_reserve[j] = 0; @@ -7202,7 +7534,7 @@ static void setup_per_zone_lowmem_reserve(void) lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; } - managed_pages += lower_zone->managed_pages; + managed_pages += zone_managed_pages(lower_zone); } } } @@ -7221,14 +7553,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->managed_pages; + lowmem_pages += zone_managed_pages(zone); } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->managed_pages; + tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -7237,20 +7569,20 @@ static void __setup_per_zone_wmarks(void) * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas control asynch page reclaim, and so should + * deltas control async page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; - min_pages = zone->managed_pages / 1024; + min_pages = zone_managed_pages(zone) / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); - zone->watermark[WMARK_MIN] = min_pages; + zone->_watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. 
*/ - zone->watermark[WMARK_MIN] = tmp; + zone->_watermark[WMARK_MIN] = tmp; } /* @@ -7259,11 +7591,12 @@ static void __setup_per_zone_wmarks(void) * ensure a minimum size on small systems. */ tmp = max_t(u64, tmp >> 2, - mult_frac(zone->managed_pages, + mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7364,6 +7697,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + return 0; +} + int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -7389,8 +7734,8 @@ static void setup_min_unmapped_ratio(void) pgdat->min_unmapped_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; } @@ -7417,8 +7762,8 @@ static void setup_min_slab_ratio(void) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, @@ -7614,9 +7959,11 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & 
HASH_ZERO) - table = memblock_virt_alloc_nopanic(size, 0); + table = memblock_alloc_nopanic(size, + SMP_CACHE_BYTES); else - table = memblock_virt_alloc_raw(size, 0); + table = memblock_alloc_raw(size, + SMP_CACHE_BYTES); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); } else { @@ -7656,8 +8003,7 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - int migratetype, - bool skip_hwpoisoned_pages) + int migratetype, int flags) { unsigned long pfn, iter, found; @@ -7691,16 +8037,27 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, goto unmovable; /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + continue; + + /* * Hugepages are not in LRU lists, but they're movable. - * We need not scan over tail pages bacause we don't + * We need not scan over tail pages because we don't * handle each tail page individually in migration. */ if (PageHuge(page)) { + struct page *head = compound_head(page); + unsigned int skip_pages; - if (!hugepage_migration_supported(page_hstate(page))) + if (!hugepage_migration_supported(page_hstate(head))) goto unmovable; - iter = round_up(iter + 1, 1<<compound_order(page)) - 1; + skip_pages = (1 << compound_order(head)) - (page - head); + iter += skip_pages - 1; continue; } @@ -7720,7 +8077,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. 
*/ - if (skip_hwpoisoned_pages && PageHWPoison(page)) + if ((flags & SKIP_HWPOISON) && PageHWPoison(page)) continue; if (__PageMovable(page)) @@ -7747,6 +8104,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, return false; unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); + if (flags & REPORT_FAILURE) + dump_page(pfn_to_page(pfn+iter), "unmovable page"); return true; } @@ -7826,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Returns zero on success or negative error code. On success all + * Return: zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ @@ -7873,8 +8232,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, - false); + pfn_max_align_up(end), migratetype, 0); if (ret) return ret; @@ -7911,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(cc.zone); order = 0; outer_start = start; diff --git a/mm/page_ext.c b/mm/page_ext.c index a9826da84ccb..ab4244920e0f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> @@ -161,9 +161,9 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_virt_alloc_try_nid_nopanic( + base = memblock_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext 
= base; @@ -273,6 +273,7 @@ static void free_page_ext(void *addr) table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } @@ -300,7 +301,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); - if (nid == -1) { + if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for @@ -398,10 +399,8 @@ void __init page_ext_init(void) * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... - * - * Take into account DEFERRED_STRUCT_PAGE_INIT. */ - if (early_pfn_to_nid(pfn) != nid) + if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; diff --git a/mm/page_idle.c b/mm/page_idle.c index 6302bc62c27d..0b39ec0c945c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kobject.h> @@ -31,7 +31,7 @@ static struct page *page_idle_get_page(unsigned long pfn) { struct page *page; - struct zone *zone; + pg_data_t *pgdat; if (!pfn_valid(pfn)) return NULL; @@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn) !get_page_unless_zero(page)) return NULL; - zone = page_zone(page); - spin_lock_irq(zone_lru_lock(zone)); + pgdat = page_pgdat(page); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); return page; } diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..2e8019d0e048 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -140,7 +140,7 @@ out: unlock_page(page); WRITE_ONCE(bio->bi_private, 
NULL); bio_put(bio); - wake_up_process(waiter); + blk_wake_io_task(waiter); put_task_struct(waiter); } @@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -294,7 +294,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, }; struct iov_iter from; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); @@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -401,6 +401,8 @@ int swap_readpage(struct page *page, bool synchronous) get_task_struct(current); bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (synchronous) + bio->bi_opf |= REQ_HIPRI; count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); @@ -409,8 +411,8 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc)) - break; + if (!blk_poll(disk->queue, qc, true)) + io_schedule(); } __set_current_state(TASK_RUNNING); bio_put(bio); @@ -423,7 +425,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 43e085608846..ce323e56b34d 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,8 +15,7 @@ 
#define CREATE_TRACE_POINTS #include <trace/events/page_isolation.h> -static int set_migratetype_isolate(struct page *page, int migratetype, - bool skip_hwpoisoned_pages) +static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { struct zone *zone; unsigned long flags, pfn; @@ -60,8 +59,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, - skip_hwpoisoned_pages)) + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags)) ret = 0; /* @@ -185,7 +183,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * prevents two threads from simultaneously working on overlapping ranges. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, bool skip_hwpoisoned_pages) + unsigned migratetype, int flags) { unsigned long pfn; unsigned long undo_pfn; @@ -199,7 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && - set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { + set_migratetype_isolate(page, migratetype, flags)) { undo_pfn = pfn; goto undo; } diff --git a/mm/page_owner.c b/mm/page_owner.c index d80adfe702d3..925b6f44a444 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -3,7 +3,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/uaccess.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> #include <linux/jump_label.h> @@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, .skip = 0 }; + count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; @@ -624,16 +625,14 @@ static 
const struct file_operations proc_page_owner_operations = { static int __init pageowner_init(void) { - struct dentry *dentry; - if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } - dentry = debugfs_create_file("page_owner", 0400, NULL, - NULL, &proc_page_owner_operations); + debugfs_create_file("page_owner", 0400, NULL, NULL, + &proc_page_owner_operations); - return PTR_ERR_OR_ZERO(dentry); + return 0; } late_initcall(pageowner_init) diff --git a/mm/page_poison.c b/mm/page_poison.c index aa2b3d34e8ea..21d4f97cb49b 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,6 +6,7 @@ #include <linux/page_ext.h> #include <linux/poison.h> #include <linux/ratelimit.h> +#include <linux/kasan.h> static bool want_page_poisoning __read_mostly; @@ -17,11 +18,16 @@ static int __init early_page_poison_param(char *buf) } early_param("page_poison", early_page_poison_param); +/** + * page_poisoning_enabled - check if page poisoning is enabled + * + * Return true if page poisoning is enabled, or false if not. + */ bool page_poisoning_enabled(void) { /* * Assumes that debug_pagealloc_enabled is set before - * free_all_bootmem. + * memblock_free_all. * Page poisoning is debug page alloc for some arches. If * either of those options are enabled, enable poisoning. */ @@ -29,12 +35,16 @@ bool page_poisoning_enabled(void) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } +EXPORT_SYMBOL_GPL(page_poisoning_enabled); static void poison_page(struct page *page) { void *addr = kmap_atomic(page); + /* KASAN still think the page is in-use, so skip it. 
*/ + kasan_disable_current(); memset(addr, PAGE_POISON, PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ae3c2a35d61b..11df03e71288 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -21,7 +21,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; } else { - if (!pte_present(*pvmw->pte)) + /* + * We get here when we are trying to unmap a private + * device page from the process address space. Such + * page is not CPU accessible and thus is mapped as + * a special swap entry, nonetheless it still does + * count as a valid regular mapping for the page (and + * is accounted as such in page maps count). + * + * So handle this special case as if it was a normal + * page mapping ie lock CPU page table and returns + * true. + * + * For more details on device private memory see HMM + * (include/linux/hmm.h or mm/hmm.c). + */ + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry)) + return false; + } else if (!pte_present(*pvmw->pte)) return false; } } diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 38de70ab1a0d..b68d5df14731 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -50,6 +50,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; + unsigned long flags; int i; chunk = pcpu_alloc_chunk(gfp); @@ -66,11 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) pcpu_set_page_chunk(nth_page(pages, i), chunk); chunk->data = pages; - chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + chunk->base_addr = page_address(pages); - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_populated(chunk, 0, nr_pages, false); - spin_unlock_irq(&pcpu_lock); + 
spin_unlock_irqrestore(&pcpu_lock, flags); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); diff --git a/mm/percpu.c b/mm/percpu.c index 4b90682623e9..c5c750781628 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -65,7 +65,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/err.h> #include <linux/lcm.h> #include <linux/list.h> @@ -1101,9 +1101,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - 0); + chunk = memblock_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1114,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); - chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0]), 0); + chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), + SMP_CACHE_BYTES); + chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), + SMP_CACHE_BYTES); + chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), + SMP_CACHE_BYTES); pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1888,7 +1888,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 
- ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2075,12 +2075,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_virt_alloc(ai->nr_groups * - sizeof(group_offsets[0]), 0); - group_sizes = memblock_virt_alloc(ai->nr_groups * - sizeof(group_sizes[0]), 0); - unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); - unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); + group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), + SMP_CACHE_BYTES); + group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), + SMP_CACHE_BYTES); + unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), + SMP_CACHE_BYTES); + unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), + SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2144,8 +2146,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. 
*/ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_virt_alloc( - pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); + pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), + SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2382,7 +2384,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( ai->atom_size = atom_size; ai->alloc_size = alloc_size; - for (group = 0, unit = 0; group_cnt[group]; group++) { + for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; /* @@ -2458,7 +2460,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_virt_alloc_nopanic(areas_size, 0); + areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2589,7 +2591,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); - if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } @@ -2599,7 +2601,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_virt_alloc(pages_size, 0); + pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; @@ -2688,7 +2690,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_virt_alloc_from_nopanic( + return memblock_alloc_from_nopanic( size, align, __pa(MAX_DMA_ADDRESS)); } @@ -2737,7 +2739,7 @@ void __init setup_per_cpu_areas(void) 
void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_virt_alloc_from_nopanic(unit_size, + fc = memblock_alloc_from_nopanic(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) diff --git a/mm/readahead.c b/mm/readahead.c index 4e630143a0ba..a4593654a26c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * @data: private data for the callback routine. * * Hides the details of the LRU cache etc from the filesystems. + * + * Returns: %0 on success, error return by @filler otherwise */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int (*filler)(void *, struct page *), void *data) @@ -176,10 +178,8 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, if (page_offset > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, page_offset); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of * contiguous pages before continuing with the next @@ -272,17 +272,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) * return it as the new window size. 
*/ static unsigned long get_next_ra_size(struct file_ra_state *ra, - unsigned long max) + unsigned long max) { unsigned long cur = ra->size; - unsigned long newsize; if (cur < max / 16) - newsize = 4 * cur; - else - newsize = 2 * cur; - - return min(newsize, max); + return 4 * cur; + if (cur <= max / 2) + return 2 * cur; + return max; } /* @@ -336,7 +334,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = page_cache_prev_hole(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -425,7 +423,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, offset + 1, max_pages); rcu_read_unlock(); if (!start || start - offset > max_pages) diff --git a/mm/rmap.c b/mm/rmap.c index 1e79fac3186b..b30c7c71d1d9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -27,7 +27,7 @@ * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock - * zone_lru_lock (in mark_page_accessed, isolate_lru_page) + * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) @@ -889,15 +889,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; - unsigned long start = address, end; + struct mmu_notifier_range range; int *cleaned = arg; /* * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. 
*/ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + mmu_notifier_range_init(&range, vma->vm_mm, address, + min(vma->vm_end, address + + (PAGE_SIZE << compound_order(page)))); + mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { unsigned long cstart; @@ -949,7 +951,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, (*cleaned)++; } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); return true; } @@ -1017,7 +1019,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page to add to rmap + * @page: Page or Hugepage to add to rmap * @vma: VM area to add page to. * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process @@ -1345,7 +1347,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; bool ret = true; - unsigned long start = address, end; + struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ @@ -1369,15 +1371,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_range_init(&range, vma->vm_mm, address, + min(vma->vm_end, address + + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted * accordingly. 
*/ - adjust_range_if_pmd_sharing_possible(vma, &start, &end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); } - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -1428,9 +1433,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * we must flush them all. start/end were * already adjusted above to cover this range. */ - flush_cache_range(vma, start, end); - flush_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + flush_cache_range(vma, range.start, range.end); + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); /* * The ref count of the PMD page was dropped @@ -1627,16 +1633,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address + PAGE_SIZE); } else { /* - * We should not need to notify here as we reach this - * case only from freeze_page() itself only call from - * split_huge_page_to_list() so everything below must - * be true: - * - page is not anonymous - * - page is locked - * - * So as it is a locked file back page thus it can not - * be remove from the page cache and replace by a new - * page before mmu_notifier_invalidate_range_end so no + * This is a locked file-backed page, thus it cannot + * be removed from the page cache and replaced by a new + * page before mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table to * point at new page while a device still is using this * page. 
@@ -1657,7 +1656,7 @@ discard: put_page(page); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -1917,27 +1916,10 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) #ifdef CONFIG_HUGETLB_PAGE /* - * The following three functions are for anonymous (private mapped) hugepages. + * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. */ -static void __hugepage_set_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) -{ - struct anon_vma *anon_vma = vma->anon_vma; - - BUG_ON(!anon_vma); - - if (PageAnon(page)) - return; - if (!exclusive) - anon_vma = anon_vma->root; - - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; - page->index = linear_page_index(vma, address); -} - void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { @@ -1949,7 +1931,7 @@ void hugepage_add_anon_rmap(struct page *page, /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) - __hugepage_set_anon_rmap(page, vma, address, 0); + __page_set_anon_rmap(page, vma, address, 0); } void hugepage_add_new_anon_rmap(struct page *page, @@ -1957,6 +1939,6 @@ void hugepage_add_new_anon_rmap(struct page *page, { BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(compound_mapcount_ptr(page), 0); - __hugepage_set_anon_rmap(page, vma, address, 1); + __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/shmem.c b/mm/shmem.c index 446942677cd4..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,6 +36,7 @@ #include <linux/uio.h> #include <linux/khugepaged.h> 
#include <linux/hugetlb.h> +#include <linux/frontswap.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -109,18 +110,24 @@ struct shmem_falloc { #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + unsigned long nr_pages = totalram_pages(); + + return min(nr_pages - totalhigh_pages(), nr_pages / 2); } #endif static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, @@ -297,12 +304,14 @@ bool shmem_charge(struct inode *inode, long pages) if (!shmem_inode_acct_block(inode, pages)) return false; + /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ + inode->i_mapping->nrpages += pages; + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - inode->i_mapping->nrpages += pages; return true; } @@ -312,6 +321,8 @@ void shmem_uncharge(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; + /* nrpages adjustment done by __delete_from_page_cache() or caller */ + spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; inode->i_blocks -= pages * BLOCKS_PER_PAGE; @@ -322,24 +333,20 @@ void shmem_uncharge(struct inode *inode, long pages) } /* - * Replace item expected in radix tree by a new item, while holding tree lock. 
+ * Replace item expected in xarray by a new item, while holding xa_lock. */ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - struct radix_tree_node *node; - void __rcu **pslot; + XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); - if (!item) - return -ENOENT; + item = xas_load(&xas); if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->i_pages, node, pslot, - replacement, NULL); + xas_store(&xas, replacement); return 0; } @@ -353,12 +360,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->i_pages, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); } /* @@ -586,9 +588,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp) { - int error, nr = hpage_nr_pages(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); + unsigned long i = 0; + unsigned long nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -600,47 +604,39 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - xa_lock_irq(&mapping->i_pages); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->i_pages, - &results, &idx, index, 1) && - idx < index + 
HPAGE_PMD_NR) { - error = -EEXIST; + do { + void *entry; + xas_lock_irq(&xas); + entry = xas_find_conflict(&xas); + if (entry != expected) + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; +next: + xas_store(&xas, page + i); + if (++i < nr) { + xas_next(&xas); + goto next; } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->i_pages, - index + i, page + i); - VM_BUG_ON(error); - } + if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(page, NR_SHMEM_THPS); } - } else if (!expected) { - error = radix_tree_insert(&mapping->i_pages, index, page); - } else { - error = shmem_radix_tree_replace(mapping, index, expected, - page); - } - - if (!error) { mapping->nrpages += nr; - if (PageTransHuge(page)) - __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - xa_unlock_irq(&mapping->i_pages); - } else { +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { page->mapping = NULL; - xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); + return xas_error(&xas); } - return error; + + return 0; } /* @@ -654,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); xa_lock_irq(&mapping->i_pages); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); @@ -665,16 +661,14 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. 
*/ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { void *old; - xa_lock_irq(&mapping->i_pages); - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); - xa_unlock_irq(&mapping->i_pages); + old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -691,29 +685,19 @@ static int shmem_free_swap(struct address_space *mapping, unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void __rcu **slot; + XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, end - 1) { + if (xas_retry(&xas, page)) continue; - } - - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -781,14 +765,14 @@ void shmem_unlock_mapping(struct address_space *mapping) break; index = indices[pvec.nr - 1] + 1; pagevec_remove_exceptionals(&pvec); - check_move_unevictable_pages(pvec.pages, pvec.nr); + check_move_unevictable_pages(&pvec); pagevec_release(&pvec); cond_resched(); } } /* - * Remove range of pages and swap entries from radix tree, and free them. + * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
*/ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, @@ -824,7 +808,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -921,7 +905,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1110,166 +1094,184 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +extern struct swap_info_struct *swap_info[]; + +static int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices, + bool frontswap) { - struct radix_tree_iter iter; - void __rcu **slot; - unsigned long found = -1; - unsigned int checked = 0; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + unsigned int ret = 0; + + if (!nr_entries) + return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) + continue; - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); + if (!xa_is_value(page)) continue; + + if (frontswap) { + swp_entry_t entry = radix_to_swp_entry(page); + + if (!frontswap_test(swap_info[swp_type(entry)], + swp_offset(entry))) + continue; } - if (entry == item) { - found = iter.index; - break; + + indices[ret] = xas.xa_index; + entries[ret] = page; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); } - checked++; - if ((checked % 4096) != 0) - continue; - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); 
+ if (++ret == nr_entries) + break; } - rcu_read_unlock(); - return found; + + return ret; } /* - * If swap found in inode, free it and move page from swapcache to filecache. + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. */ -static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page **pagep) +static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, + pgoff_t *indices) { - struct address_space *mapping = info->vfs_inode.i_mapping; - void *radswap; - pgoff_t index; - gfp_t gfp; + int i = 0; + int ret = 0; int error = 0; + struct address_space *mapping = inode->i_mapping; - radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->i_pages, radswap); - if (index == -1) - return -EAGAIN; /* tell shmem_unuse we found nothing */ + for (i = 0; i < pvec.nr; i++) { + struct page *page = pvec.pages[i]; - /* - * Move _head_ to start search for next from here. - * But be careful: shmem_evict_inode checks list_empty without taking - * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. - */ - if (shmem_swaplist.next != &info->swaplist) - list_move_tail(&shmem_swaplist, &info->swaplist); - - gfp = mapping_gfp_mask(mapping); - if (shmem_should_replace_page(*pagep, gfp)) { - mutex_unlock(&shmem_swaplist_mutex); - error = shmem_replace_page(pagep, gfp, info, index); - mutex_lock(&shmem_swaplist_mutex); - /* - * We needed to drop mutex to make that restrictive page - * allocation, but the inode might have been freed while we - * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock - * on this swapcache page is not enough to prevent that - - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. 
- * - * We must not proceed to shmem_add_to_page_cache() if the - * inode has been freed, but of course we cannot rely on - * inode or mapping or info to check that. However, we can - * safely check if our swap entry is still in use (and here - * it can't have got reused for another page): if it's still - * in use, then the inode cannot have been freed yet, and we - * can safely proceed (if it's no longer in use, that tells - * nothing about the inode, but we don't need to unuse swap). - */ - if (!page_swapcount(*pagep)) - error = -ENOENT; + if (!xa_is_value(page)) + continue; + error = shmem_swapin_page(inode, indices[i], + &page, SGP_CACHE, + mapping_gfp_mask(mapping), + NULL, NULL); + if (error == 0) { + unlock_page(page); + put_page(page); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; } + return error ? error : ret; +} - /* - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, - * but also to hold up shmem_evict_inode(): so inode cannot be freed - * beneath us (pagelock doesn't help until the page is in pagecache). - */ - if (!error) - error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); - if (error != -ENOMEM) { - /* - * Truncation and eviction use free_swap_and_cache(), which - * only does trylock page: if we raced, best clean up here. - */ - delete_from_swap_cache(*pagep); - set_page_dirty(*pagep); - if (!error) { - spin_lock_irq(&info->lock); - info->swapped--; - spin_unlock_irq(&info->lock); - swap_free(swap); +/* + * If swap found in inode, free it and move page from swapcache to filecache. 
+ */ +static int shmem_unuse_inode(struct inode *inode, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); + int ret = 0; + + pagevec_init(&pvec); + do { + unsigned int nr_entries = PAGEVEC_SIZE; + + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) + nr_entries = *fs_pages_to_unuse; + + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, + pvec.pages, indices, + frontswap); + if (pvec.nr == 0) { + ret = 0; + break; } - } - return error; + + ret = shmem_unuse_swap_entries(inode, pvec, indices); + if (ret < 0) + break; + + if (frontswap_partial) { + *fs_pages_to_unuse -= ret; + if (*fs_pages_to_unuse == 0) { + ret = FRONTSWAP_PAGES_UNUSED; + break; + } + } + + start = indices[pvec.nr - 1]; + } while (true); + + return ret; } /* - * Search through swapped inodes to find and replace swap by page. + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. */ -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - struct list_head *this, *next; - struct shmem_inode_info *info; - struct mem_cgroup *memcg; + struct shmem_inode_info *info, *next; + struct inode *inode; + struct inode *prev_inode = NULL; int error = 0; - /* - * There's a faint possibility that swap page was replaced before - * caller locked it: caller will come back later with the right page. - */ - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) - goto out; + if (list_empty(&shmem_swaplist)) + return 0; + + mutex_lock(&shmem_swaplist_mutex); /* - * Charge page using GFP_KERNEL while we can wait, before taking - * the shmem_swaplist_mutex which might hold up shmem_writepage(). 
- * Charged back to the user (not to caller) when swap account is used. + * The extra refcount on the inode is necessary to safely dereference + * p->next after re-acquiring the lock. New shmem inodes with swap + * get added to the end of the list and we will scan them all. */ - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, - &memcg, false); - if (error) - goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ - error = -EAGAIN; - - mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(this, next, &shmem_swaplist) { - info = list_entry(this, struct shmem_inode_info, swaplist); - if (info->swapped) - error = shmem_unuse_inode(info, swap, &page); - else + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { list_del_init(&info->swaplist); + continue; + } + + inode = igrab(&info->vfs_inode); + if (!inode) + continue; + + mutex_unlock(&shmem_swaplist_mutex); + if (prev_inode) + iput(prev_inode); + prev_inode = inode; + + error = shmem_unuse_inode(inode, type, frontswap, + fs_pages_to_unuse); cond_resched(); - if (error != -EAGAIN) + + mutex_lock(&shmem_swaplist_mutex); + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + if (error) break; - /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (error) { - if (error != -ENOMEM) - error = 0; - mem_cgroup_cancel_charge(page, memcg, false); - } else - mem_cgroup_commit_charge(page, memcg, true, false); -out: - unlock_page(page); - put_page(page); + if (prev_inode) + iput(prev_inode); + return error; } @@ -1353,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); + list_add(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { spin_lock_irq(&info->lock); 
@@ -1453,23 +1455,17 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; struct page *page; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return NULL; hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, @@ -1547,11 +1543,13 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, { struct page *oldpage, *newpage; struct address_space *swap_mapping; + swp_entry_t entry; pgoff_t swap_index; int error; oldpage = *pagep; - swap_index = page_private(oldpage); + entry.val = page_private(oldpage); + swap_index = swp_offset(entry); swap_mapping = page_mapping(oldpage); /* @@ -1570,7 +1568,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, __SetPageLocked(newpage); __SetPageSwapBacked(newpage); SetPageUptodate(newpage); - set_page_private(newpage, swap_index); + set_page_private(newpage, entry.val); SetPageSwapCache(newpage); /* @@ -1578,8 +1576,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * a nice clean interface for us to replace oldpage by newpage there. 
*/ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); @@ -1609,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, } /* + * Swap in the page pointed to by *pagep. + * Caller has to make sure that *pagep contains a valid swapped page. + * Returns 0 and the page in pagep if success. On failure, returns the + * the error code and NULL in *pagep. + */ +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; + struct mem_cgroup *memcg; + struct page *page; + swp_entry_t swap; + int error; + + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); + swap = radix_to_swp_entry(*pagep); + *pagep = NULL; + + /* Look it up and read it in.. */ + page = lookup_swap_cache(swap, NULL, 0); + if (!page) { + /* Or update major stats only when swapin succeeds?? 
*/ + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); + } + /* Here we actually start the io */ + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with page locked to prevent races */ + lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; + goto unlock; + } + if (!PageUptodate(page)) { + error = -EIO; + goto failed; + } + wait_on_page_writeback(page); + + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; + } + + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + false); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest. 
+ */ + if (error) { + mem_cgroup_cancel_charge(page, memcg, false); + delete_from_swap_cache(page); + } + } + if (error) + goto failed; + + mem_cgroup_commit_charge(page, memcg, true, false); + + spin_lock_irq(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + if (sgp == SGP_WRITE) + mark_page_accessed(page); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + + *pagep = page; + return 0; +failed: + if (!shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: + if (page) { + unlock_page(page); + put_page(page); + } + + return error; +} + +/* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the @@ -1629,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; - swp_entry_t swap; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; int error; @@ -1641,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) sgp = SGP_CACHE; repeat: - swap.val = 0; - page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { - swap = radix_to_swp_entry(page); - page = NULL; - } - if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - error = -EINVAL; - goto unlock; + return -EINVAL; + } + + sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = vma ? 
vma->vm_mm : current->mm; + + page = find_lock_entry(mapping, index); + if (xa_is_value(page)) { + error = shmem_swapin_page(inode, index, &page, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) + goto repeat; + + *pagep = page; + return error; } if (page && sgp == SGP_WRITE) @@ -1665,7 +1777,7 @@ repeat: put_page(page); page = NULL; } - if (page || (sgp == SGP_READ && !swap.val)) { + if (page || sgp == SGP_READ) { *pagep = page; return 0; } @@ -1674,220 +1786,138 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; - if (swap.val) { - /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { - /* Or update major stats only when swapin succeeds?? */ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { - error = -ENOMEM; - goto failed; - } - } - - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { - error = -EEXIST; /* try again */ - goto unlock; - } - if (!PageUptodate(page)) { - error = -EIO; - goto failed; - } - wait_on_page_writeback(page); - - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - if (error) - goto failed; - } - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or 
holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - * Reset swap.val? No, leave it so "failed" goes back to - * "repeat": reading a hole and writing should succeed. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } - if (error) - goto failed; - - mem_cgroup_commit_charge(page, memcg, true, false); - - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - - if (sgp == SGP_WRITE) - mark_page_accessed(page); - - delete_from_swap_cache(page); - set_page_dirty(page); - swap_free(swap); - - } else { - if (vma && userfaultfd_missing(vma)) { - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); - return 0; - } + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + loff_t i_size; + pgoff_t off; + case SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* 
fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } + /* fallthrough */ + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; + } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); - if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); - } - if (IS_ERR(page)) { - int retry = 5; - error = PTR_ERR(page); - page = NULL; - if (error != -ENOSPC) - goto failed; - /* - * Try to reclaim some spece by splitting a huge page - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } - goto failed; - } - - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; + page = shmem_alloc_and_acct_page(gfp, inode, index, true); + if (IS_ERR(page)) { +alloc_nohuge: + page = shmem_alloc_and_acct_page(gfp, inode, + index, false); + } + if (IS_ERR(page)) { + int retry = 5; - if (sgp == SGP_WRITE) - __SetPageReferenced(page); + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto unlock; + /* + * Try to reclaim some space by splitting a huge page + * beyond i_size on the filesystem. 
+ */ + while (retry--) { + int ret; - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) - goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); - goto unacct; + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + goto unlock; + } - spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - alloced = true; + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; - if (PageTransHuge(page) && - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { - /* - * Part of the huge page is beyond i_size: subject - * to shrink under memory pressure. 
- */ - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, + PageTransHuge(page)); + if (error) + goto unacct; + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); + if (error) { + mem_cgroup_cancel_charge(page, memcg, + PageTransHuge(page)); + goto unacct; + } + mem_cgroup_commit_charge(page, memcg, false, + PageTransHuge(page)); + lru_cache_add_anon(page); + + spin_lock_irq(&info->lock); + info->alloced += 1 << compound_order(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + alloced = true; + + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. */ - if (sgp == SGP_FALLOC) - sgp = SGP_WRITE; -clear: + spin_lock(&sbinfo->shrinklist_lock); /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize - * it now, lest undo on failure cancel our earlier guarantee. 
+ * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); - int i; + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } - for (i = 0; i < (1 << compound_order(head)); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); - } - SetPageUptodate(head); + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE && !PageUptodate(page)) { + struct page *head = compound_head(page); + int i; + + for (i = 0; i < (1 << compound_order(head)); i++) { + clear_highpage(head + i); + flush_dcache_page(head + i); } + SetPageUptodate(head); } /* Perhaps the file has been truncated since we checked */ @@ -1917,9 +1947,6 @@ unacct: put_page(page); goto alloc_nohuge; } -failed: - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) - error = -EEXIST; unlock: if (page) { unlock_page(page); @@ -1931,7 +1958,7 @@ unlock: spin_unlock_irq(&info->lock); goto repeat; } - if (error == -EEXIST) /* from above or from radix_tree_insert */ + if (error == -EEXIST) goto repeat; return error; } @@ -2163,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. 
+ */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2258,6 +2303,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct page *page; pte_t _dst_pte, *dst_pte; int ret; + pgoff_t offset, max_off; ret = -ENOMEM; if (!shmem_inode_acct_block(inode, 1)) @@ -2280,7 +2326,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = page; shmem_inode_unacct_blocks(inode, 1); /* don't free the page */ - return -EFAULT; + return -ENOENT; } } else { /* mfill_zeropage_atomic */ clear_highpage(page); @@ -2295,15 +2341,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); + ret = -EFAULT; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release; + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK); if (ret) goto out_release_uncharge; @@ -2312,9 +2361,25 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + else { + /* + * We don't set the pte dirty if the vma has no + * VM_WRITE permission, so mark the page dirty or it + * could be freed from under us. 
We could do it + * unconditionally before unlock_page(), but doing it + * only if VM_WRITE is not set is faster. + */ + set_page_dirty(page); + } - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + ret = -EFAULT; + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -2332,13 +2397,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); - unlock_page(page); pte_unmap_unlock(dst_pte, ptl); + unlock_page(page); ret = 0; out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); + ClearPageDirty(page); + delete_from_page_cache(page); out_release_uncharge: mem_cgroup_cancel_charge(page, memcg, false); out_release: @@ -2391,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2548,7 +2616,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * llseek SEEK_DATA or SEEK_HOLE through the page cache. 
*/ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pgoff_t index, pgoff_t end, int whence) @@ -2578,7 +2646,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } @@ -2610,9 +2678,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ - if (offset < 0) - offset = -EINVAL; - else if (offset >= inode->i_size) + if (offset < 0 || offset >= inode->i_size) offset = -ENXIO; else { start = offset >> PAGE_SHIFT; @@ -2657,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } @@ -2866,16 +2932,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret; + int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. 
*/ - ret = shmem_reserve_inode(inode->i_sb); - if (ret) - goto out; + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; + } dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); @@ -3321,7 +3391,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= totalram_pages(); do_div(size, 100); rest++; } @@ -3861,7 +3931,8 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { return 0; } diff --git a/mm/slab.c b/mm/slab.c index aa76a70e087e..28652e4218e0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, return page->s_mem + cache->size * idx; } -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct page *page, void *obj) -{ - u32 offset = (obj - page->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - #define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { @@ -563,14 +550,6 @@ static void start_cpu_timer(int cpu) static void init_arraycache(struct array_cache *ac, int limit, int batch) { - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. 
Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); if (ac) { ac->avail = 0; ac->limit = limit; @@ -586,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(ac); init_arraycache(ac, entries, batchcount); return ac; } @@ -679,20 +666,22 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, struct alien_cache *alc = NULL; alc = kmalloc_node(memsize, gfp, node); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); + if (alc) { + kmemleak_no_scan(alc); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + } return alc; } static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct alien_cache **alc_ptr; - size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - alc_ptr = kzalloc_node(memsize, gfp, node); + alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); if (!alc_ptr) return NULL; @@ -962,10 +951,10 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * To protect lockless access to n->shared during irq disabled context. * If n->shared isn't NULL in irq disabled context, accessing to it is * guaranteed to be valid until irq is re-enabled, because it will be - * freed after synchronize_sched(). + * freed after synchronize_rcu(). */ if (old_shared && force_change) - synchronize_sched(); + synchronize_rcu(); fail: kfree(old_shared); @@ -1248,7 +1237,7 @@ void __init kmem_cache_init(void) * page orders on machines with more than 32MB of memory if * not overridden on the command line. 
*/ - if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated @@ -1288,7 +1277,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); @@ -1304,7 +1293,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } @@ -1738,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * This could be made much more intelligent. For now, try to avoid using * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. + * + * Return: number of left-over bytes in a slab */ static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, slab_flags_t flags) @@ -1986,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. + * + * Return: a pointer to the created cache or %NULL in case of error */ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { @@ -2379,6 +2372,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. 
*/ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); + freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2574,7 +2568,7 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { objp = index_to_obj(cachep, page, i); - kasan_init_slab_obj(cachep, objp); + objp = kasan_init_slab_obj(cachep, objp); /* constructor could break poison info */ if (DEBUG == 0 && cachep->ctor) { @@ -2692,6 +2686,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, offset *= cachep->colour_off; + /* + * Call kasan_poison_slab() before calling alloc_slabmgmt(), so + * page_address() in the latter returns a non-tagged pointer, + * as it should be for slab pages. + */ + kasan_poison_slab(page); + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, page_node); @@ -2700,7 +2701,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); - kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -3546,12 +3546,13 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, * * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. 
+ * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3617,7 +3618,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3636,12 +3637,13 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); * node, which can improve the performance for cpu bound structures. * * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3660,7 +3662,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3675,11 +3677,13 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); return ret; } @@ -3703,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); * @size: how 
many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). * @caller: function caller for debug tracking of the caller + * + * Return: pointer to the allocated memory or %NULL in case of error */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, unsigned long caller) @@ -3710,12 +3716,14 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = slab_alloc(cachep, flags, caller); - kasan_kmalloc(cachep, ret, size, flags); + ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4166,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) * @buffer: user buffer * @count: data length * @ppos: unused + * + * Return: %0 on success, negative error code otherwise. */ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -4415,6 +4425,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int objnr; unsigned long offset; + ptr = kasan_reset_tag(ptr); + /* Find and validate object. */ cachep = page->slab_cache; objnr = obj_to_index(cachep, page, (void *)ptr); @@ -4457,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, * The caller must guarantee that objp points to a valid object previously * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. 
+ * + * Return: size of the actual memory used by @objp in bytes */ size_t ksize(const void *objp) { diff --git a/mm/slab.h b/mm/slab.h index 58c6c1c2a78e..e5e6658eeacc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return 0; if (is_root_cache(s)) return 0; return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); @@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return; memcg_kmem_uncharge(page, order); } @@ -437,11 +433,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemleak_alloc_recursive(object, s->object_size, 1, + p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object, flags); } if (memcg_kmem_enabled()) diff --git a/mm/slab_common.c b/mm/slab_common.c index fea3376f9816..03eeb8b7b4b1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -406,8 +406,9 @@ out_free_cache: goto out; } -/* - * kmem_cache_create_usercopy - Create a cache. +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. @@ -416,7 +417,6 @@ out_free_cache: * @usersize: Usercopy region size * @ctor: A constructor for the objects. * - * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a interrupt, but can be interrupted. 
* The @ctor is run when new pages are allocated by the cache. * @@ -425,12 +425,14 @@ out_free_cache: * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) * to catch references to uninitialised memory. * - * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check * for buffer overruns. * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. */ struct kmem_cache * kmem_cache_create_usercopy(const char *name, @@ -514,6 +516,31 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create_usercopy); +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. 
+ */ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) @@ -724,7 +751,7 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, css_get(&s->memcg_params.memcg->css); s->memcg_params.deact_fn = deact_fn; - call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); + call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) @@ -839,11 +866,11 @@ static void flush_memcg_workqueue(struct kmem_cache *s) mutex_unlock(&slab_mutex); /* - * SLUB deactivates the kmem_caches through call_rcu_sched. Make + * SLUB deactivates the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. */ if (IS_ENABLED(CONFIG_SLUB)) - rcu_barrier_sched(); + rcu_barrier(); /* * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB @@ -912,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy); * * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. + * + * Return: %0 if all slabs were released, non-zero otherwise */ int kmem_cache_shrink(struct kmem_cache *cachep) { @@ -973,14 +1002,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. 
This is necessary for slabs < 192 since we have non power @@ -1027,25 +1052,18 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { unsigned int index; - if (unlikely(size > KMALLOC_MAX_SIZE)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - if (size <= 192) { if (!size) return ZERO_SIZE_PTR; index = size_index[size_index_elem(size)]; - } else + } else { + if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; index = fls(size - 1); + } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1059,15 +1077,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { {"kmalloc-16", 16}, {"kmalloc-32", 32}, {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, - {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, - {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, - {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, - {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, - {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, - {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, - {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, - {"kmalloc-67108864", 67108864} + {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, + {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, + {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, + {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, + {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, + {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, + {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, + {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, + {"kmalloc-64M", 67108864} }; /* @@ -1117,9 +1135,36 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, slab_flags_t flags) +static const char * 
+kmalloc_cache_name(const char *prefix, unsigned int size) +{ + + static const char units[3] = "\0kM"; + int idx = 0; + + while (size >= 1024 && (size % 1024 == 0)) { + size /= 1024; + idx++; + } + + return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); +} + +static void __init +new_kmalloc_cache(int idx, int type, slab_flags_t flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kmalloc_cache_name("kmalloc-rcl", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1131,21 +1176,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. 
+ * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ @@ -1153,16 +1202,15 @@ void __init create_kmalloc_caches(slab_flags_t flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { unsigned int size = kmalloc_size(i); - char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%u", size); + const char *n = kmalloc_cache_name("dma-kmalloc", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif @@ -1182,8 +1230,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) flags |= __GFP_COMP; page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; + ret = kasan_kmalloc_large(ret, size, flags); + /* As ret might get tagged, call kmemleak hook after KASAN. 
*/ kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1378,7 +1427,7 @@ void dump_unreclaimable_slab(void) #if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); mutex_lock(&slab_mutex); return seq_list_start(&memcg->kmem_caches, *pos); @@ -1386,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos) void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) { - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); return seq_list_next(p, &memcg->kmem_caches, pos); } @@ -1400,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, memcg_params.kmem_caches_node); - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); if (p == memcg->kmem_caches.next) print_slabinfo_header(m); @@ -1461,7 +1510,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size, flags); + p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } @@ -1481,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, * This function is like krealloc() except it never frees the originally * allocated buffer. Use this if you don't want to free the buffer immediately * like, for example, with RCU. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -1502,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc); * lesser of the new and old sizes. If @p is %NULL, krealloc() * behaves exactly like kmalloc(). 
If @new_size is 0 and @p is not a * %NULL pointer, the object pointed to is freed. + * + * Return: pointer to the allocated memory or %NULL in case of error */ void *krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -1513,7 +1566,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) } ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; diff --git a/mm/slub.c b/mm/slub.c index 8da34a8af53d..1b08fbcb7e61 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, unsigned long ptr_addr) { #ifdef CONFIG_SLAB_FREELIST_HARDENED - return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); + /* + * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. + * Normally, this doesn't cause any issues, as both set_freepointer() + * and get_freepointer() are called with a pointer with the same tag. + * However, there are some issues with CONFIG_SLUB_DEBUG code. For + * example, when __free_slub() iterates over objects in a cache, it + * passes untagged pointers to check_object(). check_object() in turns + * calls get_freepointer() with an untagged pointer, which causes the + * freepointer to be restored incorrectly. 
+ */ + return (void *)((unsigned long)ptr ^ s->random ^ + (unsigned long)kasan_reset_tag((void *)ptr_addr)); #else return ptr; #endif @@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p < (__addr) + (__objects) * (__s)->size; \ __p += (__s)->size) -#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr), __idx = 1; \ - __idx <= __objects; \ - __p += (__s)->size, __idx++) - /* Determine object index from a given position */ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { - return (p - addr) / s->size; + return (kasan_reset_tag(p) - addr) / s->size; } static inline unsigned int order_objects(unsigned int order, unsigned int size) @@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); + object = kasan_reset_tag(object); object = restore_red_left(s, object); if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { @@ -1075,9 +1082,18 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } +static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +{ + if (!(s->flags & SLAB_POISON)) + return; + + metadata_access_enable(); + memset(addr, POISON_INUSE, PAGE_SIZE << order); + metadata_access_disable(); +} + static inline int alloc_consistency_checks(struct kmem_cache *s, - struct page *page, - void *object, unsigned long addr) + struct page *page, void *object) { if (!check_slab(s, page)) return 0; @@ -1098,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, void *object, unsigned long addr) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!alloc_consistency_checks(s, page, object, addr)) + if (!alloc_consistency_checks(s, page, object)) goto bad; } @@ -1276,22 +1292,62 @@ out: __setup("slub_debug", setup_slub_debug); +/* + * kmem_cache_flags - 
apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * @ctor: constructor function + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * then only the select slabs will receive the debug option(s). + */ slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) - flags |= slub_debug; + char *iter; + size_t len; + + /* If slub_debug = 0, it folds into the if conditional. */ + if (!slub_debug_slabs) + return flags | slub_debug; + + len = strlen(name); + iter = slub_debug_slabs; + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchr(iter, ','); + if (!end) + end = iter + strlen(iter); + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= slub_debug; + break; + } + + if (!*end) + break; + iter = end + 1; + } return flags; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} +static inline void setup_page_debug(struct kmem_cache *s, + void *addr, int order) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1334,10 +1390,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. 
*/ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size, flags); + return ptr; } static __always_inline void kfree_hook(void *x) @@ -1413,16 +1471,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, #endif } -static void setup_object(struct kmem_cache *s, struct page *page, +static void *setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); - kasan_init_slab_obj(s, object); + object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { kasan_unpoison_object_data(s, object); s->ctor(object); kasan_poison_object_data(s, object); } + return object; } /* @@ -1530,16 +1589,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page) /* First entry is used as the base of the freelist */ cur = next_freelist_entry(s, page, &pos, start, page_limit, freelist_count); + cur = setup_object(s, page, cur); page->freelist = cur; for (idx = 1; idx < page->objects; idx++) { - setup_object(s, page, cur); next = next_freelist_entry(s, page, &pos, start, page_limit, freelist_count); + next = setup_object(s, page, next); set_freepointer(s, cur, next); cur = next; } - setup_object(s, page, cur); set_freepointer(s, cur, NULL); return true; @@ -1561,7 +1620,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; - void *start, *p; + void *start, *p, *next; int idx, order; bool shuffle; @@ -1602,24 +1661,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - start = page_address(page); + kasan_poison_slab(page); - if 
(unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << order); + start = page_address(page); - kasan_poison_slab(page); + setup_page_debug(s, start, order); shuffle = shuffle_freelist(s, page); if (!shuffle) { - for_each_object_idx(p, idx, s, start, page->objects) { - setup_object(s, page, p); - if (likely(idx < page->objects)) - set_freepointer(s, p, p + s->size); - else - set_freepointer(s, p, NULL); + start = fixup_red_left(s, start); + start = setup_object(s, page, start); + page->freelist = start; + for (idx = 0, p = start; idx < page->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, page, next); + set_freepointer(s, p, next); + p = next; } - page->freelist = fixup_red_left(s, start); + set_freepointer(s, p, NULL); } page->inuse = page->objects; @@ -2069,7 +2129,7 @@ redo: if (!lock) { lock = 1; /* - * Taking the spinlock removes the possiblity + * Taking the spinlock removes the possibility * that acquire_slab() will see a slab page that * is frozen */ @@ -2089,26 +2149,15 @@ redo: } if (l != m) { - if (l == M_PARTIAL) - remove_partial(n, page); - else if (l == M_FULL) - remove_full(s, n, page); - if (m == M_PARTIAL) { - + if (m == M_PARTIAL) add_partial(n, page, tail); - stat(s, tail); - - } else if (m == M_FULL) { - - stat(s, DEACTIVATE_FULL); + else if (m == M_FULL) add_full(s, n, page); - - } } l = m; @@ -2121,7 +2170,11 @@ redo: if (lock) spin_unlock(&n->list_lock); - if (m == M_FREE) { + if (m == M_PARTIAL) + stat(s, tail); + else if (m == M_FULL) + stat(s, DEACTIVATE_FULL); + else if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, page); stat(s, FREE_SLAB); @@ -2200,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s, } /* - * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. 
* * If we did not find a slot then simply move all the partials to the * per node partial list. @@ -2275,12 +2328,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c)) { - if (c->page) - flush_slab(s, c); + if (c->page) + flush_slab(s, c); - unfreeze_partials(s, c); - } + unfreeze_partials(s, c); } static void flush_cpu_slab(void *d) @@ -2329,7 +2380,7 @@ static int slub_cpu_dead(unsigned int cpu) static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) + if (node != NUMA_NO_NODE && page_to_nid(page) != node) return 0; #endif return 1; @@ -2430,8 +2481,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, stat(s, ALLOC_SLAB); c->page = page; *pc = c; - } else - freelist = NULL; + } return freelist; } @@ -2730,7 +2780,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size, gfpflags); + ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2758,7 +2808,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size, gfpflags); + ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2954,7 +3004,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, do_slab_free(s, page, head, tail, cnt, addr); } -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); @@ -3326,16 +3376,16 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); - 
page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse = 1; - page->frozen = 0; - kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), GFP_KERNEL); + page->freelist = get_freepointer(kmem_cache_node, n); + page->inuse = 1; + page->frozen = 0; + kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3621,9 +3671,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), - sizeof(long), - GFP_ATOMIC); + unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); if (!map) return; slab_err(s, page, text, s->name); @@ -3638,7 +3686,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); - kfree(map); + bitmap_free(map); #endif } @@ -3748,7 +3796,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size, flags); + ret = kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3765,8 +3813,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmalloc_large_node_hook(ptr, size, flags); - return ptr; + return kmalloc_large_node_hook(ptr, size, flags); } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3793,7 +3840,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size, flags); + ret = kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3816,6 +3863,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int 
offset; size_t object_size; + ptr = kasan_reset_tag(ptr); + /* Find object and usable object size. */ s = page->slab_cache; @@ -4213,7 +4262,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4411,10 +4460,8 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map) return -ENOMEM; @@ -4422,7 +4469,7 @@ static long validate_slab_cache(struct kmem_cache *s) flush_all(s); for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - kfree(map); + bitmap_free(map); return count; } /* @@ -4573,14 +4620,12 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - kfree(map); + bitmap_free(map); return sprintf(buf, "Out of memory\n"); } /* Push back cpu slabs */ @@ -4646,7 +4691,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); - kfree(map); + bitmap_free(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; @@ -4657,6 +4702,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; 
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4669,7 +4715,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4678,33 +4724,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8301293331a2..7fec05796796 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,7 +20,7 @@ */ #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/memremap.h> #include <linux/highmem.h> #include <linux/slab.h> @@ -42,8 +42,8 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid_raw(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, node); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..77a0554fa5bd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,7 +5,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/export.h> @@ -68,7 +68,8 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) if (slab_is_available()) section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = memblock_virt_alloc_node(array_size, nid); + section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, + nid); return section; } @@ -196,7 +197,7 @@ static inline int next_present_section_nr(int section_nr) } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ - ((section_nr >= 0) && \ + ((section_nr != -1) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) @@ -216,7 +217,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = 
sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); + mem_section = memblock_alloc(size, align); } #endif @@ -239,6 +240,22 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) } /* + * Mark all memblocks as present using memory_present(). This is a + * convienence function that is useful for a number of arches + * to mark all of the systems memory as present during initialization. + */ +void __init memblocks_present(void) +{ + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + memory_present(memblock_get_region_node(reg), + memblock_region_memory_base_pfn(reg), + memblock_region_memory_end_pfn(reg)); + } +} + +/* * Subtle, we encode the real pfn into the mem_map such that * the identity pfn - section_mem_map will return the actual * physical page frame number. @@ -306,7 +323,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_virt_alloc_try_nid_nopanic(size, + p = memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { @@ -362,7 +379,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -391,9 +408,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, if (map) return map; - map = memblock_virt_alloc_try_nid(size, + map = memblock_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -405,9 +422,9 @@ static void __init sparse_buffer_init(unsigned long size, int 
nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = - memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + memblock_alloc_try_nid_raw(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } @@ -661,25 +678,24 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct pglist_data *pgdat, - unsigned long start_pfn, struct vmem_altmap *altmap) +int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, + struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct mem_section *ms; struct page *memmap; unsigned long *usemap; - unsigned long flags; int ret; /* * no locking for this, because it does its own * plus, it does a kmalloc */ - ret = sparse_index_init(section_nr, pgdat->node_id); + ret = sparse_index_init(section_nr, nid); if (ret < 0 && ret != -EEXIST) return ret; ret = 0; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); + memmap = kmalloc_section_memmap(section_nr, nid, altmap); if (!memmap) return -ENOMEM; usemap = __kmalloc_section_usemap(); @@ -688,27 +704,22 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, return -ENOMEM; } - pgdat_resize_lock(pgdat, &flags); - ms = __pfn_to_section(start_pfn); if (ms->section_mem_map & SECTION_MARKED_PRESENT) { ret = -EEXIST; goto out; } -#ifdef CONFIG_DEBUG_VM /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. 
*/ - memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); -#endif + page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); out: - pgdat_resize_unlock(pgdat, &flags); if (ret < 0) { kfree(usemap); __kfree_section_memmap(memmap, altmap); @@ -725,6 +736,15 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; + /* + * A further optimization is to have per section refcounted + * num_poisoned_pages. But that would need more space per memmap, so + * for now just do a quick global check to speed up this routine in the + * absence of bad pages. + */ + if (atomic_long_read(&num_poisoned_pages) == 0) + return; + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); @@ -770,10 +790,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { struct page *memmap = NULL; - unsigned long *usemap = NULL, flags; - struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long *usemap = NULL; - pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { usemap = ms->pageblock_flags; memmap = sparse_decode_mem_map(ms->section_mem_map, @@ -781,7 +799,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, ms->section_mem_map = 0; ms->pageblock_flags = NULL; } - pgdat_resize_unlock(pgdat, &flags); clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); diff --git a/mm/swap.c b/mm/swap.c index 26fc9b5f1b6c..301ed4e04320 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,7 +29,6 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> -#include <linux/memremap.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> @@ -59,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); static void 
__page_cache_release(struct page *page) { if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(zone_lru_lock(zone), flags); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irqsave(&pgdat->lru_lock, flags); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); mem_cgroup_uncharge(page); @@ -127,7 +126,7 @@ void put_pages_list(struct list_head *pages) while (!list_empty(pages)) { struct page *victim; - victim = list_entry(pages->prev, struct page, lru); + victim = lru_to_page(pages); list_del(&victim->lru); put_page(victim); } @@ -321,19 +320,14 @@ static inline void activate_page_drain(int cpu) { } -static bool need_activate_page_drain(int cpu) -{ - return false; -} - void activate_page(struct page *page) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); page = compound_head(page); - spin_lock_irq(zone_lru_lock(zone)); - __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); - spin_unlock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); + __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); + spin_unlock_irq(&pgdat->lru_lock); } #endif @@ -654,13 +648,15 @@ void lru_add_drain(void) put_cpu(); } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); } -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); - /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is @@ -703,6 +699,12 @@ void lru_add_drain_all(void) 
mutex_unlock(&lock); } +#else +void lru_add_drain_all(void) +{ + lru_add_drain(); +} +#endif /** * release_pages - batched put_page() @@ -824,8 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); - VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); + lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); if (!list) SetPageLRU(page_tail); @@ -965,7 +966,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; @@ -1002,7 +1003,7 @@ EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag) + xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); @@ -1012,7 +1013,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages) + xa_mark_t tag, unsigned max_pages) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); @@ -1024,7 +1025,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); */ void __init swap_setup(void) { - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..85245fdec8d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -107,14 +107,15 @@ void show_swap_cache_info(void) } /* - * __add_to_swap_cache resembles add_to_page_cache_locked 
on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) { - int error, i, nr = hpage_nr_pages(page); - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); + unsigned long i, nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -123,73 +124,52 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) page_ref_add(page, nr); SetPageSwapCache(page); - address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - for (i = 0; i < nr; i++) { - set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->i_pages, - idx + i, page + i); - if (unlikely(error)) - break; - } - if (likely(!error)) { + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + set_page_private(page + i, entry.val + i); + xas_store(&xas, page + i); + xas_next(&xas); + } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); - } else { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. 
- */ - VM_BUG_ON(error == -EEXIST); - set_page_private(page + i, 0UL); - while (i--) { - radix_tree_delete(&address_space->i_pages, idx + i); - set_page_private(page + i, 0UL); - } - ClearPageSwapCache(page); - page_ref_sub(page, nr); - } - xa_unlock_irq(&address_space->i_pages); - - return error; -} +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + if (!xas_error(&xas)) + return 0; -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) -{ - int error; - - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } - return error; + ClearPageSwapCache(page); + page_ref_sub(page, nr); + return xas_error(&xas); } /* * This must be called only on pages that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page) +void __delete_from_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); - swp_entry_t entry; - pgoff_t idx; + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); - entry.val = page_private(page); - address_space = swap_address_space(entry); - idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->i_pages, idx + i); + void *entry = xas_store(&xas, NULL); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); + xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; @@ -217,7 +197,7 @@ int add_to_swap(struct page *page) return 0; /* - * Radix-tree node allocations from PF_MEMALLOC contexts could + * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. 
__GFP_NOMEMALLOC * stops emergency reserves from being allocated. * @@ -229,7 +209,6 @@ int add_to_swap(struct page *page) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - /* -ENOMEM radix-tree allocation failure */ if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -263,14 +242,11 @@ fail: */ void delete_from_swap_cache(struct page *page) { - swp_entry_t entry; - struct address_space *address_space; + swp_entry_t entry = { .val = page_private(page) }; + struct address_space *address_space = swap_address_space(entry); - entry.val = page_private(page); - - address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, entry); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); @@ -414,18 +390,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@ -433,26 +401,20 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ cond_resched(); continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); + } else if (err) /* swp entry is obsolete ? */ break; - } - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); - /* - * Initiate read into locked page and return. 
- */ + /* Initiate read into locked page */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -561,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. + * Caller must hold read mmap_sem if vmf->vma is not NULL. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) @@ -581,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; + /* Test swap type to make sure the dereference is safe */ + if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + struct inode *inode = si->swap_file->f_mapping->host; + if (inode_read_congested(inode)) + goto skip; + } + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; @@ -625,7 +594,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ @@ -729,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf, pte_unmap(orig_pte); } +/** + * swap_vma_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read in a few pages whoes + * virtual addresses are around the fault address in the same vma. 
+ * + * Caller must hold read mmap_sem if vmf->vma is not NULL. + * + */ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf) { diff --git a/mm/swapfile.c b/mm/swapfile.c index d954b71c4f9c..2b8d9c3fbb47 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -98,31 +98,53 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +static struct swap_info_struct *swap_type_to_swap_info(int type) +{ + if (type >= READ_ONCE(nr_swapfiles)) + return NULL; + + smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ + return READ_ONCE(swap_info[type]); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* Reclaim the swap entry anyway if possible */ +#define TTRS_ANYWAY 0x1 +/* + * Reclaim the swap entry if there are no more mappings of the + * corresponding page + */ +#define TTRS_UNMAPPED 0x2 +/* Reclaim the swap entry if swap is getting full*/ +#define TTRS_FULL 0x4 + /* returns 1 if swap entry is freed */ -static int -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + page = find_get_page(swap_address_space(entry), offset); if (!page) return 0; /* - * This function is called from scan_swap_map() and it's called - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. - * We have to use trylock for avoiding deadlock. This is a special + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * here. We have to use trylock for avoiding deadlock. 
This is a special * case and you should use try_to_free_swap() with explicit lock_page() * in usual operations. */ if (trylock_page(page)) { - ret = try_to_free_swap(page); + if ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) + ret = try_to_free_swap(page); unlock_page(page); } put_page(page); @@ -780,7 +802,7 @@ checks: int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) @@ -919,6 +941,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) struct swap_cluster_info *ci; ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); @@ -989,7 +1012,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FILE)) + if (!(si->flags & SWP_FS)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -1030,12 +1053,14 @@ noswap: /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si; + struct swap_info_struct *si = swap_type_to_swap_info(type); pgoff_t offset; - si = swap_info[type]; + if (!si) + goto fail; + spin_lock(&si->lock); - if (si && (si->flags & SWP_WRITEOK)) { + if (si->flags & SWP_WRITEOK) { atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); @@ -1046,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type) atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); +fail: return (swp_entry_t) {0}; } @@ -1057,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) if 
(!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + p = swap_type_to_swap_info(type); + if (!p) goto bad_nofile; - p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -1169,6 +1195,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); + if (!usage) + free_swap_slot(entry); return usage; } @@ -1199,10 +1227,8 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, 1)) - free_swap_slot(entry); - } + if (p) + __swap_entry_free(p, entry, 1); } /* @@ -1237,9 +1263,6 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); @@ -1612,7 +1635,6 @@ int try_to_free_swap(struct page *page) int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; - struct page *page = NULL; unsigned char count; if (non_swap_entry(entry)) @@ -1622,30 +1644,9 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { count = __swap_entry_free(p, entry, 1); if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) { - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); - if (page && !trylock_page(page)) { - put_page(page); - page = NULL; - } - } else if (!count) - free_swap_slot(entry); - } - if (page) { - /* - * Not mapped elsewhere, or swap space full? Free it! - * Also recheck PageSwapCache now page is locked (above). 
- */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_page_trans_huge_swapped(p, entry)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } - unlock_page(page); - put_page(page); + !swap_page_trans_huge_swapped(p, entry)) + __try_to_reclaim_swap(p, swp_offset(entry), + TTRS_UNMAPPED | TTRS_FULL); } return p != NULL; } @@ -1708,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) sector_t swapdev_block(int type, pgoff_t offset) { struct block_device *bdev; + struct swap_info_struct *si = swap_type_to_swap_info(type); - if ((unsigned int)type >= nr_swapfiles) - return 0; - if (!(swap_info[type]->flags & SWP_WRITEOK)) + if (!si || !(si->flags & SWP_WRITEOK)) return 0; return map_swap_entry(swp_entry(type, offset), &bdev); } @@ -1810,44 +1810,77 @@ out_nolock: } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned long addr, unsigned long end, + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - pte_t swp_pte = swp_entry_to_pte(entry); + struct page *page; + swp_entry_t entry; pte_t *pte; + struct swap_info_struct *si; + unsigned long offset; int ret = 0; + volatile unsigned char *swap_map; - /* - * We don't actually need pte lock while scanning for swp_pte: since - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the - * page table while we're scanning; though it could get zapped, and on - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse - * of unmatched parts which look like swp_pte, so unuse_pte must - * recheck under pte lock. Scanning without pte lock lets it be - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. - */ + si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - /* - * swapoff spends a _lot_ of time in this loop! 
- * Test inline before going to call unuse_pte. - */ - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { - pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); - if (ret) - goto out; - pte = pte_offset_map(pmd, addr); + struct vm_fault vmf; + + if (!is_swap_pte(*pte)) + continue; + + entry = pte_to_swp_entry(*pte); + if (swp_type(entry) != type) + continue; + + offset = swp_offset(entry); + if (frontswap && !frontswap_test(si, offset)) + continue; + + pte_unmap(pte); + swap_map = &si->swap_map[offset]; + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (!page) { + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) + goto try_next; + return -ENOMEM; } + + lock_page(page); + wait_on_page_writeback(page); + ret = unuse_pte(vma, pmd, addr, entry, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto out; + } + + try_to_free_swap(page); + unlock_page(page); + put_page(page); + + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { + ret = FRONTSWAP_PAGES_UNUSED; + goto out; + } +try_next: + pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); + + ret = 0; out: return ret; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pmd_t *pmd; unsigned long next; @@ -1859,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = unuse_pte_range(vma, pmd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -1868,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int 
unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { pud_t *pud; unsigned long next; @@ -1879,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = unuse_pmd_range(vma, pud, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -1888,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { p4d_t *p4d; unsigned long next; @@ -1899,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); + ret = unuse_pud_range(vma, p4d, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { pgd_t *pgd; unsigned long addr, end, next; int ret; - if (page_anon_vma(page)) { - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - return 0; - else - end = addr + PAGE_SIZE; - } else { - addr = vma->vm_start; - end = vma->vm_end; - } + addr = vma->vm_start; + end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret 
= unuse_p4d_range(vma, pgd, addr, next, entry, page); + ret = unuse_p4d_range(vma, pgd, addr, next, type, + frontswap, fs_pages_to_unuse); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) +static int unuse_mm(struct mm_struct *mm, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { struct vm_area_struct *vma; int ret = 0; - if (!down_read_trylock(&mm->mmap_sem)) { - /* - * Activate page so shrink_inactive_list is unlikely to unmap - * its ptes while lock is dropped, so swapoff can make progress. - */ - activate_page(page); - unlock_page(page); - down_read(&mm->mmap_sem); - lock_page(page); - } + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) - break; + if (vma->anon_vma) { + ret = unuse_vma(vma, type, frontswap, + fs_pages_to_unuse); + if (ret) + break; + } cond_resched(); } up_read(&mm->mmap_sem); - return (ret < 0)? ret: 0; + return ret; } /* * Scan swap_map (or frontswap_map if frontswap parameter is true) - * from current position to next entry still in use. - * Recycle to start on reaching the end, returning 0 when empty. + * from current position to next entry still in use. Return 0 + * if there are no inuse entries after prev till end of the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev, bool frontswap) { - unsigned int max = si->max; - unsigned int i = prev; + unsigned int i; unsigned char count; /* @@ -1979,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ - for (;;) { - if (++i >= max) { - if (!prev) { - i = 0; - break; - } - /* - * No entries in use at top of swap_map, - * loop back to start and recheck there. 
- */ - max = prev + 1; - prev = 0; - i = 1; - } + for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) if (!frontswap || frontswap_test(si, i)) @@ -2000,239 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, if ((i % LATENCY_LIMIT) == 0) cond_resched(); } + + if (i == si->max) + i = 0; + return i; } /* - * We completely avoid races by reading each swap page in advance, - * and then search for the process using it. All the necessary - * page table adjustments can then be made atomically. - * - * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * If the boolean frontswap is true, only unuse pages_to_unuse pages; * pages_to_unuse==0 means all pages; ignored if frontswap is false */ +#define SWAP_UNUSE_MAX_TRIES 3 int try_to_unuse(unsigned int type, bool frontswap, unsigned long pages_to_unuse) { + struct mm_struct *prev_mm; + struct mm_struct *mm; + struct list_head *p; + int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct mm_struct *start_mm; - volatile unsigned char *swap_map; /* swap_map is accessed without - * locking. Mark it as volatile - * to prevent compiler doing - * something odd. - */ - unsigned char swcount; struct page *page; swp_entry_t entry; - unsigned int i = 0; - int retval = 0; + unsigned int i; + int retries = 0; - /* - * When searching mms for an entry, a good strategy is to - * start at the first mm we freed the previous entry from - * (though actually we don't notice whether we or coincidence - * freed the entry). Initialize this start_mm with a hold. - * - * A simpler strategy would be to start at the last mm we - * freed the previous entry from; but that would take less - * advantage of mmlist ordering, which clusters forked mms - * together, child after parent. 
If we race with dup_mmap(), we - * prefer to resolve parent before child, lest we miss entries - * duplicated after we scanned child: using last mm would invert - * that. - */ - start_mm = &init_mm; - mmget(&init_mm); + if (!si->inuse_pages) + return 0; - /* - * Keep on scanning until all entries have gone. Usually, - * one pass through swap_map is enough, but not necessarily: - * there are races when an instance of an entry might be missed. - */ - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + if (!frontswap) + pages_to_unuse = 0; + +retry: + retval = shmem_unuse(type, frontswap, &pages_to_unuse); + if (retval) + goto out; + + prev_mm = &init_mm; + mmget(prev_mm); + + spin_lock(&mmlist_lock); + p = &init_mm.mmlist; + while ((p = p->next) != &init_mm.mmlist) { if (signal_pending(current)) { retval = -EINTR; break; } - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); - page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0, false); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - swcount = *swap_map; - /* - * We don't hold lock here, so the swap entry could be - * SWAP_MAP_BAD (when the cluster is discarding). - * Instead of fail out, We can just skip the swap - * entry because swapoff will wait for discarding - * finish anyway. - */ - if (!swcount || swcount == SWAP_MAP_BAD) - continue; - retval = -ENOMEM; - break; - } + mm = list_entry(p, struct mm_struct, mmlist); + if (!mmget_not_zero(mm)) + continue; + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - /* - * Don't hold on to start_mm if it looks like exiting. 
- */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - mmget(&init_mm); + if (retval) { + mmput(prev_mm); + goto out; } /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page_locked" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page_locked(page); - wait_on_page_writeback(page); - lock_page(page); - wait_on_page_writeback(page); - - /* - * Remove all references to entry. + * Make sure that we aren't completely killing + * interactive performance. */ - swcount = *swap_map; - if (swap_count(swcount) == SWAP_MAP_SHMEM) { - retval = shmem_unuse(entry, page); - /* page has already been unlocked and released */ - if (retval < 0) - break; - continue; - } - if (swap_count(swcount) && start_mm != &init_mm) - retval = unuse_mm(start_mm, entry, page); - - if (swap_count(*swap_map)) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *prev_mm = start_mm; - struct mm_struct *mm; - - mmget(new_start_mm); - mmget(prev_mm); - spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - if (!mmget_not_zero(mm)) - continue; - spin_unlock(&mmlist_lock); - mmput(prev_mm); - prev_mm = mm; + cond_resched(); + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); - cond_resched(); + mmput(prev_mm); - swcount = *swap_map; - if (!swap_count(swcount)) /* any usage ? 
*/ - ; - else if (mm == &init_mm) - set_start_mm = 1; - else - retval = unuse_mm(mm, entry, page); - - if (set_start_mm && *swap_map < swcount) { - mmput(new_start_mm); - mmget(mm); - new_start_mm = mm; - set_start_mm = 0; - } - spin_lock(&mmlist_lock); - } - spin_unlock(&mmlist_lock); - mmput(prev_mm); - mmput(start_mm); - start_mm = new_start_mm; - } - if (retval) { - unlock_page(page); - put_page(page); - break; - } + i = 0; + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_unmap could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Given how unuse_vma() targets one particular offset - * in an anon_vma, once the anon_vma has been determined, - * this splitting happens to be just what is needed to - * handle where KSM pages have been swapped out: re-reading - * is unnecessarily slow, but we can fix that later on. - */ - if (swap_count(*swap_map) && - PageDirty(page) && PageSwapCache(page)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - swap_writepage(compound_head(page), &wbc); - lock_page(page); - wait_on_page_writeback(page); - } + entry = swp_entry(type, i); + page = find_get_page(swap_address_space(entry), i); + if (!page) + continue; /* * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock at the top, - * or while we dropped it in unuse_mm(). 
The page might even - * be back in swap cache on another swap area: that we must not - * delete, since it may not have been written out to swap yet. + * swap cache just before we acquired the page lock. The page + * might even be back in swap cache on another swap area. But + * that is okay, try_to_free_swap() only removes stale pages. */ - if (PageSwapCache(page) && - likely(page_private(page) == entry.val) && - !page_swapped(page)) - delete_from_swap_cache(compound_head(page)); - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_page_list will preserve it. - */ - SetPageDirty(page); + lock_page(page); + wait_on_page_writeback(page); + try_to_free_swap(page); unlock_page(page); put_page(page); /* - * Make sure that we aren't completely killing - * interactive performance. + * For frontswap, we just need to unuse pages_to_unuse, if + * it was specified. Need not check frontswap again here as + * we already zeroed out pages_to_unuse if not frontswap. */ - cond_resched(); - if (frontswap && pages_to_unuse > 0) { - if (!--pages_to_unuse) - break; - } + if (pages_to_unuse && --pages_to_unuse == 0) + goto out; } - mmput(start_mm); - return retval; + /* + * Lets check again to see if there are still swap entries in the map. + * If yes, we would need to do retry the unuse logic again. + * Under global memory pressure, swap entries can be reinserted back + * into process space after the mmlist loop above passes over them. + * Its not worth continuosuly retrying to unuse the swap in this case. + * So we try SWAP_UNUSE_MAX_TRIES times. + */ + if (++retries >= SWAP_UNUSE_MAX_TRIES) + retval = -EBUSY; + else if (si->inuse_pages) + goto retry; + +out: + return (retval == FRONTSWAP_PAGES_UNUSED) ? 
0 : retval; } /* @@ -2268,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) struct swap_extent *se; pgoff_t offset; - sis = swap_info[swp_type(entry)]; + sis = swp_swap_info(entry); *bdev = sis->bdev; offset = swp_offset(entry); @@ -2310,12 +2204,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) kfree(se); } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_FILE; - mapping->a_ops->swap_deactivate(swap_file); + sis->flags &= ~SWP_ACTIVATED; + if (mapping->a_ops->swap_deactivate) + mapping->a_ops->swap_deactivate(swap_file); } } @@ -2364,6 +2259,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } +EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages @@ -2411,8 +2307,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (ret >= 0) + sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FILE; + sis->flags |= SWP_FS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } @@ -2706,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2728,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) else type = si->type + 1; - for (; type < nr_swapfiles; type++) { - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + for 
(; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; @@ -2820,7 +2714,7 @@ static struct swap_info_struct *alloc_swap_info(void) unsigned int type; int i; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -2831,21 +2725,21 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); - kfree(p); + kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; - swap_info[type] = p; + WRITE_ONCE(swap_info[type], p); /* * Write swap_info[type] before nr_swapfiles, in case a * racing procfs swap_start() or swap_next() is reading them. * (We never shrink nr_swapfiles, we never free this entry.) */ smp_wmb(); - nr_swapfiles++; + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); } else { - kfree(p); + kvfree(p); p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() @@ -3363,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; @@ -3371,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) if (non_swap_entry(entry)) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) + p = swp_swap_info(entry); + if (!p) goto bad_file; - p = swap_info[type]; + offset = swp_offset(entry); if (unlikely(offset >= p->max)) goto out; @@ -3471,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *swp_swap_info(swp_entry_t entry) { - return swap_info[swp_type(entry)]; + return swap_type_to_swap_info(swp_type(entry)); } struct swap_info_struct *page_swap_info(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c index 1d2fb2dca96f..b7d3c99f00c9 100644 --- a/mm/truncate.c +++ 
b/mm/truncate.c @@ -33,15 +33,12 @@ static inline void __clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_node *node; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); - if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) + xas_set_update(&xas, workingset_update_node); + if (xas_load(&xas) != entry) return; - if (*slot != entry) - return; - __radix_tree_replace(&mapping->i_pages, node, slot, NULL, - workingset_update_node); + xas_store(&xas, NULL); mapping->nrexceptional--; } @@ -70,7 +67,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; for (j = 0; j < pagevec_count(pvec); j++) - if (radix_tree_exceptional_entry(pvec->pages[j])) + if (xa_is_value(pvec->pages[j])) break; if (j == pagevec_count(pvec)) @@ -85,7 +82,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, struct page *page = pvec->pages[i]; pgoff_t index = indices[i]; - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { pvec->pages[j++] = page; continue; } @@ -347,7 +344,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!trylock_page(page)) @@ -442,7 +439,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; lock_page(page); @@ -520,9 +517,13 @@ void truncate_inode_pages_final(struct address_space *mapping) */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); - - truncate_inode_pages(mapping, 0); } + + /* + * Cleancache needs notification even if there are no pages or shadow + * entries. + */ + truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); @@ -538,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final); * invalidate_mapping_pages() will not block on IO activity. 
It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. + * + * Return: the number of the pages that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -561,7 +564,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -663,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. + * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -692,7 +695,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; @@ -738,10 +741,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping, index++; } /* - * For DAX we invalidate page tables after invalidating radix tree. We + * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't - * work as we have no cheap way to find whether radix tree entry didn't + * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { @@ -760,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EBUSY if any pages could not be invalidated. 
+ * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { diff --git a/mm/usercopy.c b/mm/usercopy.c index 852eb4e53f06..14faadcedd06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address - * - known-safe heap or stack object + * - fully contained by stack (or stack frame, when available) + * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) @@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); - /* Check for bad heap object. */ - check_heap_object(ptr, n, to_user); - /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: @@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) usercopy_abort("process stack", NULL, to_user, 0, n); } + /* Check for bad heap object. */ + check_heap_object(ptr, n, to_user); + /* Check for object in kernel to avoid text exposure. 
*/ check_kernel_text_object((const unsigned long)ptr, n, to_user); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5029f241908f..d59b5a73dfb3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -33,6 +33,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, void *page_kaddr; int ret; struct page *page; + pgoff_t offset, max_off; + struct inode *inode; if (!*pagep) { ret = -ENOMEM; @@ -48,7 +50,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { - ret = -EFAULT; + ret = -ENOENT; *pagep = page; /* don't free the page */ goto out; @@ -73,8 +75,17 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -108,11 +119,22 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; + pgoff_t offset, max_off; + struct inode *inode; _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_unlock; set_pte_at(dst_mm, 
dst_addr, dst_pte, _dst_pte); @@ -205,8 +227,9 @@ retry: if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; /* - * Only allow __mcopy_atomic_hugetlb on userfaultfd - * registered ranges. + * Check the vma is registered in uffd, this is + * required to enforce the VM_MAYWRITE check done at + * uffd registration time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -274,7 +297,7 @@ retry: cond_resched(); - if (unlikely(err == -EFAULT)) { + if (unlikely(err == -ENOENT)) { up_read(&dst_mm->mmap_sem); BUG_ON(!page); @@ -380,7 +403,17 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, { ssize_t err; - if (vma_is_anonymous(dst_vma)) { + /* + * The normal page fault path for a shmem will invoke the + * fault, fill the hole in the file and COW it right away. The + * result generates plain anonymous memory. So when we are + * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll + * generate anonymous memory directly without actually filling + * the hole. For the MAP_PRIVATE case the robustness check + * only happens in the pagetable (to verify it's still none) + * and not in the radix tree. + */ + if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page); @@ -449,13 +482,9 @@ retry: if (!dst_vma) goto out_unlock; /* - * Be strict and only allow __mcopy_atomic on userfaultfd - * registered ranges to prevent userland errors going - * unnoticed. As far as the VM consistency is concerned, it - * would be perfectly safe to remove this check, but there's - * no useful usage for __mcopy_atomic ouside of userfaultfd - * registered ranges. This is after all why these are ioctls - * belonging to the userfaultfd and not syscalls. + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -489,7 +518,8 @@ retry: * dst_vma. 
*/ err = -ENOMEM; - if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) + if (!(dst_vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { @@ -513,7 +543,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } @@ -530,7 +560,7 @@ retry: src_addr, &page, zeropage); cond_resched(); - if (unlikely(err == -EFAULT)) { + if (unlikely(err == -ENOENT)) { void *page_kaddr; up_read(&dst_mm->mmap_sem); diff --git a/mm/util.c b/mm/util.c index 9e3ebd2ef65f..d559bde497a9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -15,17 +15,10 @@ #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> -#include <asm/sections.h> #include <linux/uaccess.h> #include "internal.h" -static inline int is_kernel_rodata(unsigned long addr) -{ - return addr >= (unsigned long)__start_rodata && - addr < (unsigned long)__end_rodata; -} - /** * kfree_const - conditionally free memory * @x: pointer to the memory @@ -43,6 +36,8 @@ EXPORT_SYMBOL(kfree_const); * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrdup(const char *s, gfp_t gfp) { @@ -65,9 +60,10 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Function returns source string if it is in .rodata section otherwise it - * fallbacks to kstrdup. - * Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * + * Return: source string if it is in .rodata section otherwise + * fallback to kstrdup. 
*/ const char *kstrdup_const(const char *s, gfp_t gfp) { @@ -85,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const); * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -110,6 +108,8 @@ EXPORT_SYMBOL(kstrndup); * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -127,6 +127,9 @@ EXPORT_SYMBOL(kmemdup); * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { @@ -150,14 +153,14 @@ EXPORT_SYMBOL(kmemdup_nul); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result is physically + * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER); + p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -176,7 +179,7 @@ EXPORT_SYMBOL(memdup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result may be not + * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) @@ -200,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user); * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. 
+ * + * Return: newly allocated copy of @s or %NULL in case of error */ char *strndup_user(const char __user *s, long n) { @@ -231,7 +236,7 @@ EXPORT_SYMBOL(strndup_user); * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. + * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { @@ -317,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - * * get_user_pages_fast provides equivalent functionality to get_user_pages, * operating on current and current->mm, with force=0 and vma=NULL. However * unlike get_user_pages, it must be called without mmap_sem held. @@ -332,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * pages have to be faulted in, it may turn out to be slightly slower so * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. + * + * Return: number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -393,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap); * * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not * fall back to vmalloc. + * + * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -442,7 +449,7 @@ EXPORT_SYMBOL(kvmalloc_node); * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * - * Context: Any context except NMI. 
+ * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { @@ -485,7 +492,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } @@ -600,7 +607,7 @@ unsigned long vm_commit_limit(void) if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; @@ -685,8 +692,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. @@ -737,7 +743,8 @@ error: * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. - * Returns the size of the cmdline field copied. Note that the copy does + * + * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a728fc492557..e86ba6e74b50 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -498,7 +498,11 @@ nocache: } found: - if (addr + size > vend) + /* + * Check also calculated address against the vstart, + * because it can be 0 because of big align request. 
+ */ + if (addr + size > vend || addr < vstart) goto overflow; va->va_start = addr; @@ -840,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * - * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { @@ -1187,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1421,13 +1426,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * get_vm_area - reserve a contiguous kernel virtual area - * @size: size of the area - * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * get_vm_area - reserve a contiguous kernel virtual area + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * - * Search an area of @size in the kernel virtual mapping area, - * and reserved it for out purposes. Returns the area descriptor - * on success or %NULL on failure. + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. + * + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { @@ -1444,12 +1451,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, } /** - * find_vm_area - find a continuous kernel virtual area - * @addr: base address + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. 
+ * It is up to the caller to do all required locking to keep the returned + * pointer valid. * - * Search for the kernel VM area starting at @addr, and return it. - * It is up to the caller to do all required locking to keep the returned - * pointer valid. + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *find_vm_area(const void *addr) { @@ -1463,12 +1472,14 @@ struct vm_struct *find_vm_area(const void *addr) } /** - * remove_vm_area - find and remove a continuous kernel virtual area - * @addr: base address + * remove_vm_area - find and remove a continuous kernel virtual area + * @addr: base address * - * Search for the kernel VM area starting at @addr, and remove it. - * This function returns the found VM area, but using it is NOT safe - * on SMP machines, except for its size or flags. + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines, except for its size or flags. + * + * Return: pointer to the found area or %NULL on faulure */ struct vm_struct *remove_vm_area(const void *addr) { @@ -1505,7 +1516,7 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vmap_area((unsigned long)addr)->vm; + area = find_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); @@ -1548,11 +1559,11 @@ static inline void __vfree_deferred(const void *addr) } /** - * vfree_atomic - release memory allocated by vmalloc() - * @addr: memory base address + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address * - * This one is just like vfree() but can be called in any atomic context - * except NMIs. + * This one is just like vfree() but can be called in any atomic context + * except NMIs. 
*/ void vfree_atomic(const void *addr) { @@ -1565,19 +1576,29 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } +static void __vfree(const void *addr) +{ + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); +} + /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - release memory allocated by vmalloc() + * @addr: memory base address + * + * Free the virtually continuous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is + * NULL, no operation is performed. * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * May sleep if called *not* from interrupt context. 
* - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { @@ -1585,23 +1606,23 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + + __vfree(addr); } EXPORT_SYMBOL(vfree); /** - * vunmap - release virtual mapping obtained by vmap() - * @addr: memory base address + * vunmap - release virtual mapping obtained by vmap() + * @addr: memory base address * - * Free the virtually contiguous memory area starting at @addr, - * which was created from the page array passed to vmap(). + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). * - * Must not be called in interrupt context. + * Must not be called in interrupt context. */ void vunmap(const void *addr) { @@ -1613,24 +1634,26 @@ void vunmap(const void *addr) EXPORT_SYMBOL(vunmap); /** - * vmap - map an array of pages into virtually contiguous space - * @pages: array of page pointers - * @count: number of pages to map - * @flags: vm_area->flags - * @prot: page protection for the mapping - * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * vmap - map an array of pages into virtually contiguous space + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. 
+ * + * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, - unsigned long flags, pgprot_t prot) + unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long size; /* In bytes */ might_sleep(); - if (count > totalram_pages) + if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; @@ -1705,25 +1728,27 @@ fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); - vfree(area->addr); + __vfree(area->addr); return NULL; } /** - * __vmalloc_node_range - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @start: vm area range start - * @end: vm area range end - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address - * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * __vmalloc_node_range - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @start: vm area range start + * @end: vm area range end + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ * + * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, @@ -1735,7 +1760,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | @@ -1764,25 +1789,35 @@ fail: return NULL; } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node_range); +#endif + /** - * __vmalloc_node - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages - * @node: node to use for allocation or NUMA_NO_NODE - * @caller: caller's return address + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
* - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL - * and __GFP_NOFAIL are not supported + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL + * and __GFP_NOFAIL are not supported * - * Any use of gfp flags outside of GFP_KERNEL should be consulted - * with mm people. + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. * + * Return: pointer to the allocated memory or %NULL on error */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1814,13 +1849,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, } /** - * vmalloc - allocate virtually contiguous memory - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * vmalloc - allocate virtually contiguous memory + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { @@ -1830,14 +1868,17 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** - * vzalloc - allocate virtually contiguous memory with zero fill - * @size: allocation size - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. - * The memory allocated is set to zero. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. 
+ * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { @@ -1852,34 +1893,30 @@ EXPORT_SYMBOL(vzalloc); * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); /** - * vmalloc_node - allocate memory on a specific node - * @size: allocation size - * @node: numa node + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. * - * Allocate enough pages to cover @size from the page level - * allocator and map them into contiguous kernel virtual space. + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. 
+ * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { @@ -1899,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node); * * For tight control over page level allocator and protection flags * use __vmalloc_node() instead. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { @@ -1908,17 +1947,18 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. + * Return: pointer to the allocated memory or %NULL on error */ - void *vmalloc_exec(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, @@ -1938,11 +1978,13 @@ void *vmalloc_exec(unsigned long size) #endif /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size * - * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into contiguous kernel virtual space. + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. 
+ * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { @@ -1953,23 +1995,19 @@ EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory - * @size: allocation size + * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); @@ -2055,31 +2093,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) } /** - * vread() - read vmalloc area in a safe way. - * @buf: buffer for reading data - * @addr: vm address. - * @count: number of bytes to be read. - * - * Returns # of bytes which addr and buf should be increased. - * (same number to @count). Returns 0 if [addr...addr+count) doesn't - * includes any intersect with alive vmalloc area. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from that area to a given buffer. If the given memory range - * of [addr...addr+count) includes some valid address, data is copied to - * proper area of @buf. If there are memory holes, they'll be zero-filled. - * IOREMAP area is treated as memory hole and no copy is done. - * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. 
- * - * Note: In usual ops, vread() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. - * + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't include any intersection with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any information, as /dev/kmem. + * + * Return: number of bytes for which addr and buf should be increased + * (same number as @count) or %0 if [addr...addr+count) doesn't + * include any intersection with valid vmalloc area */ - long vread(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2136,31 +2172,29 @@ finished: } /** - * vwrite() - write vmalloc area in a safe way. - * @buf: buffer for source data - * @addr: vm address. - * @count: number of bytes to be read. - * - * Returns # of bytes which addr and buf should be incresed. - * (same number to @count). - * If [addr...addr+count) doesn't includes any intersect with valid - * vmalloc area, returns 0. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from a buffer to the given addr.
If specified range of - * [addr...addr+count) includes some valid address, data is copied from - * proper area of @buf. If there are memory holes, no copy to hole. - * IOREMAP area is treated as memory hole and no copy is done. - * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vwrite() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any informaion, as /dev/kmem. + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be written. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't include any intersection with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any information, as /dev/kmem.
+ * + * Return: number of bytes for which addr and buf should be + * increased (same number as @count) or %0 if [addr...addr+count) + * doesn't include any intersection with valid vmalloc area */ - long vwrite(char *buf, char *addr, unsigned long count) { struct vmap_area *va; @@ -2212,20 +2246,20 @@ finished: } /** - * remap_vmalloc_range_partial - map vmalloc pages to userspace - * @vma: vma to cover - * @uaddr: target user address to start at - * @kaddr: virtual address of vmalloc kernel memory - * @size: size of map area + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that @kaddr is a valid vmalloc'ed area, - * and that it is big enough to cover the range starting at - * @uaddr in @vma. Will return failure if that criteria isn't - * met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. 
* - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long size) @@ -2244,7 +2278,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (kaddr + size > area->addr + area->size) + if (kaddr + size > area->addr + get_vm_area_size(area)) return -EINVAL; do { @@ -2267,18 +2301,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, EXPORT_SYMBOL(remap_vmalloc_range_partial); /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map * - * Returns: 0 for success, -Exxx on failure + * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. 
* - * Similar to remap_pfn_range() (see mm/memory.c) + * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) @@ -2310,18 +2344,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) } /** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * @ptes: returns the PTEs for the address space + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @ptes: returns the PTEs for the address space * - * Returns: NULL on failure, vm_struct on success + * Returns: NULL on failure, vm_struct on success * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. * - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) - * allocated for the VM area are returned. + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { @@ -2747,4 +2781,3 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif - diff --git a/mm/vmscan.c b/mm/vmscan.c index c5ef7240cbcb..a5ad0b35ab8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,9 +46,11 @@ #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/oom.h> +#include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> +#include <linux/psi.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -86,6 +88,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* e.g. 
boosted watermark reclaim leaves slabs alone */ + unsigned int may_shrinkslab:1; + /* * Cgroups are not reclaimed below their configured memory.low, * unless we threaten to OOM. If any cgroups are skipped due to @@ -369,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone */ int prealloc_shrinker(struct shrinker *shrinker) { - size_t size = sizeof(*shrinker->nr_deferred); + unsigned int size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -473,19 +478,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); - - /* - * Make sure we apply some minimal pressure on default priority - * even on small cgroups. Stale objects are not only consuming memory - * by themselves, but can also hold a reference to a dying cgroup, - * preventing it from being reclaimed. A dying cgroup with all - * corresponding structures like per-cpu stats and kmem caches - * can be really big, so it may lead to a significant waste of memory. - */ - delta = max_t(unsigned long long, delta, min(freeable, batch_size)); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } total_scan += delta; if (total_scan < 0) { @@ -741,12 +745,12 @@ static inline int is_page_cache_freeable(struct page *page) { /* * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. + * that isolated the page, the page cache and optional buffer + * heads at page->private. 
*/ - int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? HPAGE_PMD_NR : 1; - return page_count(page) - page_has_private(page) == 1 + radix_pins; + return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -922,7 +926,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { @@ -948,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(mapping, page); + shadow = workingset_eviction(page); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -1102,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - int pgactivate = 0; - unsigned nr_unqueued_dirty = 0; - unsigned nr_dirty = 0; - unsigned nr_congested = 0; unsigned nr_reclaimed = 0; - unsigned nr_writeback = 0; - unsigned nr_immediate = 0; - unsigned nr_ref_keep = 0; - unsigned nr_unmap_fail = 0; + memset(stat, 0, sizeof(*stat)); cond_resched(); while (!list_empty(page_list)) { @@ -1155,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) - nr_dirty++; + stat->nr_dirty++; if (dirty && !writeback) - nr_unqueued_dirty++; + stat->nr_unqueued_dirty++; /* * Treat this page as congested if the underlying BDI is or if @@ -1170,7 +1167,7 @@ static unsigned long 
shrink_page_list(struct list_head *page_list, if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) - nr_congested++; + stat->nr_congested++; /* * If a page at the tail of the LRU is under writeback, there @@ -1219,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { - nr_immediate++; + stat->nr_immediate++; goto activate_locked; /* Case 2 above */ @@ -1237,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * and it's also appropriate in global reclaim. */ SetPageReclaim(page); - nr_writeback++; + stat->nr_writeback++; goto activate_locked; /* Case 3 above */ @@ -1257,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: - nr_ref_keep++; + stat->nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1322,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) { - nr_unmap_fail++; + stat->nr_unmap_fail++; goto activate_locked; } } @@ -1446,14 +1443,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; - /* - * At this point, we have no other references and there is - * no way to pick any more up (removed from LRU, removed - * from pagecache). Can use non-atomic bitops now (and - * we obviously don't have to worry about waking up a process - * waiting on the page lock, because there are no references. 
- */ - __ClearPageLocked(page); + + unlock_page(page); free_it: nr_reclaimed++; @@ -1476,7 +1467,7 @@ activate_locked: VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { SetPageActive(page); - pgactivate++; + stat->nr_activate++; count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1491,18 +1482,8 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, pgactivate); - - if (stat) { - stat->nr_dirty = nr_dirty; - stat->nr_congested = nr_congested; - stat->nr_unqueued_dirty = nr_unqueued_dirty; - stat->nr_writeback = nr_writeback; - stat->nr_immediate = nr_immediate; - stat->nr_activate = pgactivate; - stat->nr_ref_keep = nr_ref_keep; - stat->nr_unmap_fail = nr_unmap_fail; - } + count_vm_events(PGACTIVATE, stat->nr_activate); + return nr_reclaimed; } @@ -1514,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; + struct reclaim_stat dummy_stat; unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1527,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, &dummy_stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1632,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -/* - * zone_lru_lock is heavily contended. Some of the functions that +/** + * pgdat->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. 
* @@ -1655,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, enum lru_list lru) + enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; @@ -1664,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); + isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); scan = 0; for (total_scan = 0; @@ -1767,11 +1750,11 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { - struct zone *zone = page_zone(page); + pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1779,7 +1762,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } return ret; } @@ -1901,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - struct reclaim_stat stat = {}; - isolate_mode_t isolate_mode = 0; + struct reclaim_stat stat; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -1923,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = 
isolate_lru_pages(nr_to_scan, lruvec, &page_list, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; @@ -2011,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone_lru_lock across the whole operation. But if + * appropriate to hold pgdat->lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone_lru_lock around each page. It's impossible to balance + * should drop pgdat->lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. @@ -2086,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, - &nr_scanned, sc, isolate_mode, lru); + &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; @@ -2145,6 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } @@ -2456,9 +2432,11 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. 
+ * Make sure we don't miss the last page + * because of a round-off error. */ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: @@ -2742,8 +2720,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - shrink_slab(sc->gfp_mask, pgdat->node_id, + if (sc->may_shrinkslab) { + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + } /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2751,16 +2731,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_reclaimed - reclaimed); /* - * Direct reclaim and kswapd have to scan all memory - * cgroups to fulfill the overall scan target for the - * node. + * Kswapd has to scan all memory cgroups to fulfill + * the overall scan target for the node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will * retry with decreasing priority if one round over the * whole hierarchy is not sufficient.
*/ - if (!global_reclaim(sc) && + if (!current_is_kswapd() && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(root, memcg); break; @@ -3225,6 +3204,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, + .may_shrinkslab = 1, }; /* @@ -3269,6 +3249,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, + .may_shrinkslab = 1, }; unsigned long lru_pages; @@ -3302,6 +3283,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3314,6 +3296,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = may_swap, + .may_shrinkslab = 1, }; /* @@ -3330,9 +3313,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3360,6 +3347,30 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +{ + int i; + struct zone *zone; + + /* + * Check for watermark boosts top-down as the higher zones + * are more likely to be boosted. Both watermarks and boosts + * should not be checked at the same time as reclaim would + * start prematurely when there is no boosting and a lower + * zone is balanced.
+ */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + if (zone->watermark_boost) + return true; + } + + return false; +} + /* * Returns true if there is an eligible zone balanced for the request order * and classzone_idx @@ -3370,6 +3381,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) unsigned long mark = -1; struct zone *zone; + /* + * Check watermarks bottom-up as lower zones are more likely to + * meet watermarks. + */ for (i = 0; i <= classzone_idx; i++) { zone = pgdat->node_zones + i; @@ -3488,7 +3503,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ @@ -3497,23 +3512,44 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; + unsigned long nr_boost_reclaim; + unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; + bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, - .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); + /* + * Account for the reclaim boost. Note that the zone boost is left in + * place so that parallel allocations that are near the watermark will + * stall or direct reclaim until kswapd is finished. 
+ */ + nr_boost_reclaim = 0; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + nr_boost_reclaim += zone->watermark_boost; + zone_boosts[i] = zone->watermark_boost; + } + boosted = nr_boost_reclaim; + +restart: + sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool balanced; bool ret; sc.reclaim_idx = classzone_idx; @@ -3540,13 +3576,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Note that - * sc.reclaim_idx is not used as buffer_heads_over_limit may - * have adjusted it. + * If the pgdat is imbalanced then ignore boosting and preserve + * the watermarks for a later time and restart. Note that the + * zone watermarks will be still reset at the end of balancing + * on the grounds that the normal reclaim should be enough to + * re-evaluate if boosting is required when kswapd next wakes. */ - if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + if (!balanced && nr_boost_reclaim) { + nr_boost_reclaim = 0; + goto restart; + } + + /* + * If boosting is not active then only reclaim if there are no + * eligible zones. Note that sc.reclaim_idx is not used as + * buffer_heads_over_limit may have adjusted it. + */ + if (!nr_boost_reclaim && balanced) goto out; + /* Limit the priority of boosting to avoid reclaim writeback */ + if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) + raise_priority = false; + + /* + * Do not writeback or swap pages for boosted reclaim. The + * intent is to relieve pressure not issue sub-optimal IO + * from reclaim context. If no pages are reclaimed, the + * reclaim will be aborted. 
+ */ + sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_swap = !nr_boost_reclaim; + sc.may_shrinkslab = !nr_boost_reclaim; + /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. All @@ -3598,6 +3661,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); + + /* + * If reclaim made no progress for a boost, stop reclaim as + * IO cannot be queued and it could be an infinite loop in + * extreme circumstances. + */ + if (nr_boost_reclaim && !nr_reclaimed) + break; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); @@ -3606,8 +3679,31 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) pgdat->kswapd_failures++; out: + /* If reclaim was boosted, account for the reclaim done in this pass */ + if (boosted) { + unsigned long flags; + + for (i = 0; i <= classzone_idx; i++) { + if (!zone_boosts[i]) + continue; + + /* Increments are under the zone lock */ + zone = pgdat->node_zones + i; + spin_lock_irqsave(&zone->lock, flags); + zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* + * As there is now likely space, wake up kcompactd to defragment + * pageblocks. + */ + wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + } + snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account.
If another caller @@ -3833,7 +3929,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - pgdat_balanced(pgdat, order, classzone_idx)) { + (pgdat_balanced(pgdat, order, classzone_idx) && + !pgdat_watermark_boosted(pgdat, classzone_idx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -4161,17 +4258,16 @@ int page_evictable(struct page *page) return ret; } -#ifdef CONFIG_SHMEM /** - * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list - * @pages: array of pages to check - * @nr_pages: number of pages to check - * - * Checks pages for evictability and moves them to the appropriate lru list. + * check_move_unevictable_pages - check pages for evictability and move to + * appropriate zone lru list + * @pvec: pagevec with lru pages to check * - * This function is only used for SysV IPC SHM_UNLOCK. + * Checks pages for evictability, if an evictable page is in the unevictable + * lru list, moves it to the appropriate evictable lru list. This function + * should be only used for lru pages. 
*/ -void check_move_unevictable_pages(struct page **pages, int nr_pages) +void check_move_unevictable_pages(struct pagevec *pvec) { struct lruvec *lruvec; struct pglist_data *pgdat = NULL; @@ -4179,8 +4275,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) int pgrescued = 0; int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pages[i]; + for (i = 0; i < pvec->nr; i++) { + struct page *page = pvec->pages[i]; struct pglist_data *pagepgdat = page_pgdat(page); pgscanned++; @@ -4212,4 +4308,4 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) spin_unlock_irq(&pgdat->lru_lock); } } -#endif /* CONFIG_SHMEM */ +EXPORT_SYMBOL_GPL(check_move_unevictable_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 7878da76abf2..36b56f858f0f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone) * 125 1024 10 16-32 GB 9 */ - mem = zone->managed_pages >> (27 - PAGE_SHIFT); + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", + "workingset_nodes", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1567,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone->managed_pages); + zone_managed_pages(zone)); seq_printf(m, "\n protection: (%ld", @@ -1663,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) stat_items_size += sizeof(struct vm_event_state); #endif 
+ BUILD_BUG_ON(stat_items_size != + ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) @@ -1706,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); @@ -1827,12 +1827,13 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. - * This works because the diffs are byte sized items. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * + sizeof(p->vm_stat_diff[0]))) return true; #ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * + sizeof(p->vm_numa_stat_diff[0]))) return true; #endif } @@ -2120,21 +2121,14 @@ static int __init extfrag_debug_init(void) struct dentry *extfrag_debug_root; extfrag_debug_root = debugfs_create_dir("extfrag", NULL); - if (!extfrag_debug_root) - return -ENOMEM; - if (!debugfs_create_file("unusable_index", 0444, - extfrag_debug_root, NULL, &unusable_file_ops)) - goto fail; + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_file_ops); - if (!debugfs_create_file("extfrag_index", 0444, - extfrag_debug_root, NULL, &extfrag_file_ops)) - goto fail; + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_file_ops); return 0; -fail: - debugfs_remove_recursive(extfrag_debug_root); - return -ENOMEM; } module_init(extfrag_debug_init); diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..0bedf67502d5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. 
* * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -148,21 +160,20 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. 
*/ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -170,23 +181,27 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + eviction = (eviction << 1) | workingset; - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; + bool workingset; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,20 +210,20 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** * workingset_eviction - 
note the eviction of a page from memory - * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->i_pages in place + * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct address_space *mapping, struct page *page) +void *workingset_eviction(struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +235,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +278,51 @@ bool workingset_refault(void *shadow) * configurations instead. 
*/ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. 
*/ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** @@ -340,7 +365,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -350,12 +375,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. 
*/ - if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -364,12 +397,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -384,20 +417,26 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. 
* - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -410,11 +449,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -422,15 +461,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. 
* * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -440,6 +478,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* @@ -447,29 +487,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of 
the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); @@ -491,7 +523,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; @@ -516,7 +548,7 @@ static int __init workingset_init(void) * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; - max_order = fls_long(totalram_pages - 1); + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", diff --git a/mm/z3fold.c b/mm/z3fold.c index 4b366d181f35..aee9b0b8d907 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -99,6 +99,7 @@ struct z3fold_header { #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) #define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 /** * struct z3fold_pool - stores metadata for each z3fold pool @@ -145,7 +146,7 @@ enum z3fold_page_flags { MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, PAGE_STALE, - UNDER_RECLAIM + PAGE_CLAIMED, /* by either reclaim or free */ }; /***************** @@ -174,7 +175,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); - clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -223,8 +224,11 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) unsigned long handle; handle = (unsigned long)zhdr; - if 
(bud != HEADLESS) - handle += (bud + zhdr->first_num) & BUDDY_MASK; + if (bud != HEADLESS) { + handle |= (bud + zhdr->first_num) & BUDDY_MASK; + if (bud == LAST) + handle |= (zhdr->last_chunks << BUDDY_SHIFT); + } return handle; } @@ -234,6 +238,12 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) return (struct z3fold_header *)(handle & PAGE_MASK); } +/* only for LAST bud, returns zero otherwise */ +static unsigned short handle_to_chunks(unsigned long handle) +{ + return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; +} + /* * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle * but that doesn't matter. because the masking will result in the @@ -720,37 +730,39 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) { - /* HEADLESS page stored */ - bud = HEADLESS; - } else { - z3fold_page_lock(zhdr); - bud = handle_to_buddy(handle); - - switch (bud) { - case FIRST: - zhdr->first_chunks = 0; - break; - case MIDDLE: - zhdr->middle_chunks = 0; - zhdr->start_middle = 0; - break; - case LAST: - zhdr->last_chunks = 0; - break; - default: - pr_err("%s: unknown bud %d\n", __func__, bud); - WARN_ON(1); - z3fold_page_unlock(zhdr); - return; + /* if a headless page is under reclaim, just leave. + * NB: we use test_and_set_bit for a reason: if the bit + * has not been set before, we release this page + * immediately so we don't care about its value any more. 
+ */ + if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) { + spin_lock(&pool->lock); + list_del(&page->lru); + spin_unlock(&pool->lock); + free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); } + return; } - if (bud == HEADLESS) { - spin_lock(&pool->lock); - list_del(&page->lru); - spin_unlock(&pool->lock); - free_z3fold_page(page); - atomic64_dec(&pool->pages_nr); + /* Non-headless case */ + z3fold_page_lock(zhdr); + bud = handle_to_buddy(handle); + + switch (bud) { + case FIRST: + zhdr->first_chunks = 0; + break; + case MIDDLE: + zhdr->middle_chunks = 0; + break; + case LAST: + zhdr->last_chunks = 0; + break; + default: + pr_err("%s: unknown bud %d\n", __func__, bud); + WARN_ON(1); + z3fold_page_unlock(zhdr); return; } @@ -758,7 +770,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } - if (test_bit(UNDER_RECLAIM, &page->private)) { + if (test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -836,20 +848,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) } list_for_each_prev(pos, &pool->lru) { page = list_entry(pos, struct page, lru); + + /* this bit could have been set by free, in which case + * we pass over to the next page in the pool. 
+ */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; + + zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) - /* candidate found */ break; - zhdr = page_address(page); - if (!z3fold_page_trylock(zhdr)) + if (!z3fold_page_trylock(zhdr)) { + zhdr = NULL; continue; /* can't evict at this point */ + } kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; - set_bit(UNDER_RECLAIM, &page->private); break; } + if (!zhdr) + break; + list_del_init(&page->lru); spin_unlock(&pool->lock); @@ -898,6 +920,7 @@ next: if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); return 0; } spin_lock(&pool->lock); @@ -905,7 +928,7 @@ next: spin_unlock(&pool->lock); } else { z3fold_page_lock(zhdr); - clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); @@ -964,7 +987,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) set_bit(MIDDLE_CHUNK_MAPPED, &page->private); break; case LAST: - addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); break; default: pr_err("unknown buddy id %d\n", buddy); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9da65552e7ca..0787d33b80d8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fallthru */ + case ZPOOL_MM_RW: /* fall through */ default: zs_mm = ZS_MM_RW; break; diff --git a/mm/zswap.c b/mm/zswap.c index cd91fd9d96b8..a4e4d36ec085 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = { static bool zswap_is_full(void) { - return totalram_pages * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + 
return totalram_pages() * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static void zswap_update_total_size(void) |