Diffstat (limited to 'mm')
107 files changed, 10538 insertions, 7409 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..761f5021ba51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -249,6 +249,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config DEVICE_MIGRATION + def_bool MIGRATION && ZONE_DEVICE + config ARCH_ENABLE_HUGEPAGE_MIGRATION bool @@ -262,6 +265,9 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. + Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be + clamped down to MAX_ORDER - 1. + config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA @@ -411,6 +417,9 @@ choice benefit. endchoice +config ARCH_WANT_GENERAL_HUGETLB + bool + config ARCH_WANTS_THP_SWAP def_bool n @@ -432,43 +441,20 @@ config NEED_PER_CPU_KM bool default y -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancache code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache +config NEED_PER_CPU_EMBED_FIRST_CHUNK + bool -config FRONTSWAP - bool "Enable frontswap to cache swap pages if tmem is present" - depends on SWAP - help - Frontswap is so named because it can be thought of as the opposite - of a "backing" store for a swap device. The data is stored into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. When space in transcendent memory is available, - a significant swap I/O reduction may be achieved. When none is - available, all frontswap calls are reduced to a single pointer- - compare-against-NULL resulting in a negligible performance hit - and swap data is stored as normal on the matching swap device. +config NEED_PER_CPU_PAGE_FIRST_CHUNK + bool + +config USE_PERCPU_NUMA_NODE_ID + bool - If unsure, say Y to enable frontswap. +config HAVE_SETUP_PER_CPU_AREA + bool + +config FRONTSWAP + bool config CMA bool "Contiguous Memory Allocator" @@ -533,7 +519,8 @@ config MEM_SOFT_DIRTY config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" - depends on FRONTSWAP && CRYPTO=y + depends on SWAP && CRYPTO=y + select FRONTSWAP select ZPOOL help A lightweight compressed cache for swap pages. It takes @@ -766,6 +753,15 @@ config IDLE_PAGE_TRACKING config ARCH_HAS_CACHE_LINE_SIZE bool +config ARCH_HAS_CURRENT_STACK_POINTER + bool + help + In support of HARDENED_USERCOPY performing stack variable lifetime + checking, an architecture-agnostic way to find the stack pointer + is needed. 
Once an architecture defines an unsigned long global + register alias named "current_stack_pointer", this config can be + selected. + config ARCH_HAS_PTE_DEVMAP bool @@ -798,9 +794,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -812,7 +805,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device @@ -900,6 +892,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..5bd5bb097252 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,30 @@ config PAGE_OWNER If unsure, say N. +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". 
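As an aside on the new ARCH_HAS_CURRENT_STACK_POINTER option introduced above: its help text expects the architecture to expose the stack pointer as a global register alias named "current_stack_pointer". Purely as an illustration (not part of this commit; the register name is architecture-specific and "sp" below is only an assumed example), such a definition is a one-line global register variable in an arch header:

	/* hypothetical arch header; the register name varies per architecture */
	register unsigned long current_stack_pointer asm("sp");

With that alias in place, the HARDENED_USERCOPY stack-lifetime checks can read the current stack pointer in an architecture-agnostic way, which is exactly what the help text describes.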
+ config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index 7919cd7f13f2..4cc13f3179a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o @@ -104,7 +105,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o @@ -114,6 +114,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eae96dfe0261..7176af65b103 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1005,60 +1005,3 @@ const char *bdi_dev_name(struct backing_dev_info *bdi) return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); - -static wait_queue_head_t congestion_wqh[2] = { - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) - }; -static atomic_t nr_wb_congested[2]; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - wait_queue_head_t *wqh = &congestion_wqh[sync]; - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.congested)) - atomic_dec(&nr_wb_congested[sync]); - smp_mb__after_atomic(); - if (waitqueue_active(wqh)) - wake_up(wqh); -} -EXPORT_SYMBOL(clear_bdi_congested); - -void set_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.congested)) - atomic_inc(&nr_wb_congested[sync]); -} -EXPORT_SYMBOL(set_bdi_congested); - -/** - * congestion_wait - wait for a backing_dev to become uncongested - * @sync: SYNC or ASYNC IO - * @timeout: timeout in jiffies - * - * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit - * write congestion. If no backing_devs are congested then just wait for the - * next write to be completed. 
- */ -long congestion_wait(int sync, long timeout) -{ - long ret; - unsigned long start = jiffies; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[sync]; - - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - - trace_writeback_congestion_wait(jiffies_to_usecs(timeout), - jiffies_to_usecs(jiffies - start)); - - return ret; -} -EXPORT_SYMBOL(congestion_wait); diff --git a/mm/cleancache.c b/mm/cleancache.c deleted file mode 100644 index db7eee9c0886..000000000000 --- a/mm/cleancache.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.rst for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/exportfs.h> -#include <linux/mm.h> -#include <linux/debugfs.h> -#include <linux/cleancache.h> - -/* - * cleancache_ops is set by cleancache_register_ops to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static const struct cleancache_ops *cleancache_ops __read_mostly; - -/* - * Counters available via /sys/kernel/debug/cleancache (if debugfs is - * properly configured. These are for information only so are not protected - * against increment races. - */ -static u64 cleancache_succ_gets; -static u64 cleancache_failed_gets; -static u64 cleancache_puts; -static u64 cleancache_invalidates; - -static void cleancache_register_ops_sb(struct super_block *sb, void *unused) -{ - switch (sb->cleancache_poolid) { - case CLEANCACHE_NO_BACKEND: - __cleancache_init_fs(sb); - break; - case CLEANCACHE_NO_BACKEND_SHARED: - __cleancache_init_shared_fs(sb); - break; - } -} - -/* - * Register operations for cleancache. Returns 0 on success. - */ -int cleancache_register_ops(const struct cleancache_ops *ops) -{ - if (cmpxchg(&cleancache_ops, NULL, ops)) - return -EBUSY; - - /* - * A cleancache backend can be built as a module and hence loaded after - * a cleancache enabled filesystem has called cleancache_init_fs. To - * handle such a scenario, here we call ->init_fs or ->init_shared_fs - * for each active super block. To differentiate between local and - * shared filesystems, we temporarily initialize sb->cleancache_poolid - * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED - * respectively in case there is no backend registered at the time - * cleancache_init_fs or cleancache_init_shared_fs is called. - * - * Since filesystems can be mounted concurrently with cleancache - * backend registration, we have to be careful to guarantee that all - * cleancache enabled filesystems that has been mounted by the time - * cleancache_register_ops is called has got and all mounted later will - * get cleancache_poolid. 
This is assured by the following statements - * tied together: - * - * a) iterate_supers skips only those super blocks that has started - * ->kill_sb - * - * b) if iterate_supers encounters a super block that has not finished - * ->mount yet, it waits until it is finished - * - * c) cleancache_init_fs is called from ->mount and - * cleancache_invalidate_fs is called from ->kill_sb - * - * d) we call iterate_supers after cleancache_ops has been set - * - * From a) it follows that if iterate_supers skips a super block, then - * either the super block is already dead, in which case we do not need - * to bother initializing cleancache for it, or it was mounted after we - * initiated iterate_supers. In the latter case, it must have seen - * cleancache_ops set according to d) and initialized cleancache from - * ->mount by itself according to c). This proves that we call - * ->init_fs at least once for each active super block. - * - * From b) and c) it follows that if iterate_supers encounters a super - * block that has already started ->init_fs, it will wait until ->mount - * and hence ->init_fs has finished, then check cleancache_poolid, see - * that it has already been set and therefore do nothing. This proves - * that we call ->init_fs no more than once for each super block. - * - * Combined together, the last two paragraphs prove the function - * correctness. - * - * Note that various cleancache callbacks may proceed before this - * function is called or even concurrently with it, but since - * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop - * until the corresponding ->init_fs has been actually called and - * cleancache_ops has been set. - */ - iterate_supers(cleancache_register_ops_sb, NULL); - return 0; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_fs(PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); - if (len <= FILEID_ROOT || len == FILEID_INVALID) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. 
- * Page must be locked by caller. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_failed_gets++; - goto out; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = cleancache_ops->get_page(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_puts++; - return; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - cleancache_ops->put_page(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Invalidate any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0) { - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (cleancache_get_key(mapping->host, &key) >= 0) { - cleancache_ops->invalidate_page(pool_id, - key, page->index); - cleancache_invalidates++; - } - } -} -EXPORT_SYMBOL(__cleancache_invalidate_page); - -/* - * Invalidate all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - cleancache_ops->invalidate_inode(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_invalidate_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be returned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs. 
- */ -void __cleancache_invalidate_fs(struct super_block *sb) -{ - int pool_id; - - pool_id = sb->cleancache_poolid; - sb->cleancache_poolid = CLEANCACHE_NO_POOL; - - if (cleancache_ops && pool_id >= 0) - cleancache_ops->invalidate_fs(pool_id); -} -EXPORT_SYMBOL(__cleancache_invalidate_fs); - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_DEBUG_FS - struct dentry *root = debugfs_create_dir("cleancache", NULL); - - debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets); - debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets); - debugfs_create_u64("puts", 0444, root, &cleancache_puts); - debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates); -#endif - return 0; -} -module_init(init_cleancache) @@ -131,8 +131,10 @@ not_in_zone: bitmap_free(cma->bitmap); out_error: /* Expose all pages to the buddy, they are useless for CMA. */ - for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) - free_reserved_page(pfn_to_page(pfn)); + if (!cma->reserve_pages_on_error) { + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); @@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); +void __init cma_reserve_pages_on_error(struct cma *cma) +{ + cma->reserve_pages_on_error = true; +} + /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area @@ -168,7 +175,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma) { struct cma *cma; - phys_addr_t alignment; /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { @@ -179,15 +185,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - /* alignment should be aligned with order_per_bit */ - if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) return -EINVAL; - if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + /* ensure minimal alignment required by mm core */ + if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; /* @@ -262,14 +265,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; - /* - * Sanitise input arguments. - * Pages both ends in CMA area could be merged into adjacent unmovable - * migratetype page by page allocator's buddy algorithm. In the case, - * you couldn't get a contiguous memory, which is not what we want. - */ - alignment = max(alignment, (phys_addr_t)PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + /* Sanitise input arguments. 
*/ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", @@ -30,6 +30,7 @@ struct cma { /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif + bool reserve_pages_on_error; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/compaction.c b/mm/compaction.c index 6e446094ce90..c3e37aa9ff9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -785,7 +785,7 @@ static bool too_many_isolated(pg_data_t *pgdat) * @cc: Compaction control structure. * @low_pfn: The first PFN to isolate * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock - * @isolate_mode: Isolation mode to be used. + * @mode: Isolation mode to be used. * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. @@ -798,7 +798,7 @@ static bool too_many_isolated(pg_data_t *pgdat) */ static int isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, - unsigned long end_pfn, isolate_mode_t isolate_mode) + unsigned long end_pfn, isolate_mode_t mode) { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; @@ -806,6 +806,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long flags = 0; struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; + struct address_space *mapping; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; unsigned long next_skip_pfn = 0; @@ -990,7 +991,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - if (!isolate_movable_page(page, isolate_mode)) + if (!isolate_movable_page(page, mode)) goto isolate_success; } @@ -1002,15 +1003,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * so avoid taking lru_lock and isolating it unnecessarily in an * admittedly racy check. */ - if (!page_mapping(page) && - page_count(page) > page_mapcount(page)) + mapping = page_mapping(page); + if (!mapping && page_count(page) > page_mapcount(page)) goto isolate_fail; /* * Only allow to migrate anonymous pages in GFP_NOFS context * because those do not depend on fs locks. */ - if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) + if (!(cc->gfp_mask & __GFP_FS) && mapping) goto isolate_fail; /* @@ -1021,9 +1022,45 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(!get_page_unless_zero(page))) goto isolate_fail; - if (!__isolate_lru_page_prepare(page, isolate_mode)) + /* Only take pages on LRU: a check now makes later tests safe */ + if (!PageLRU(page)) + goto isolate_fail_put; + + /* Compaction might skip unevictable pages but CMA takes them */ + if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page)) + goto isolate_fail_put; + + /* + * To minimise LRU disruption, the caller can indicate with + * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages + * it will be able to migrate without blocking - clean pages + * for the most part. PageWriteback would require blocking. + */ + if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page)) goto isolate_fail_put; + if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) { + bool migrate_dirty; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking. 
However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. + */ + if (!trylock_page(page)) + goto isolate_fail_put; + + mapping = page_mapping(page); + migrate_dirty = !mapping || mapping->a_ops->migratepage; + unlock_page(page); + if (!migrate_dirty) + goto isolate_fail_put; + } + /* Try isolate the page */ if (!TestClearPageLRU(page)) goto isolate_fail_put; @@ -2280,6 +2317,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + unsigned int nr_succeeded = 0; /* * These counters track activities during zone compaction. Initialize @@ -2349,8 +2387,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) update_cached = !sync && cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, - cc->free_pfn, end_pfn, sync); + trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync); /* lru_add_drain_all could be expensive with involving other CPUs */ lru_add_drain(); @@ -2398,10 +2435,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION, NULL); + MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc->nr_migratepages, err, - &cc->migratepages); + trace_mm_compaction_migratepages(cc, nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; @@ -2477,8 +2513,7 @@ out: count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); - trace_mm_compaction_end(start_pfn, cc->migrate_pfn, - cc->free_pfn, end_pfn, sync, ret); + trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); return ret; } diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5bcf05851ad0..9b559c76d6dd 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -25,33 +25,40 @@ config DAMON_KUNIT_TEST If unsure, say N. config DAMON_VADDR - bool "Data access monitoring primitives for virtual address spaces" + bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that work for virtual address spaces. config DAMON_PADDR - bool "Data access monitoring primitives for the physical address space" + bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG help - This builds the default data access monitoring primitives for DAMON + This builds the default data access monitoring operations for DAMON that works for the physical address space. config DAMON_VADDR_KUNIT_TEST - bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + bool "Test for DAMON operations" if !KUNIT_ALL_TESTS depends on DAMON_VADDR && KUNIT=y default KUNIT_ALL_TESTS help - This builds the DAMON virtual addresses primitives Kunit test suite. + This builds the DAMON virtual addresses operations Kunit test suite. For more information on KUnit and unit tests in general, please refer to the KUnit documentation. If unsure, say N. 
+config DAMON_SYSFS + bool "DAMON sysfs interface" + depends on DAMON && SYSFS + help + This builds the sysfs interface for DAMON. The user space can use + the interface for arbitrary data access monitoring. + config DAMON_DBGFS bool "DAMON debugfs interface" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7d5ac377a2b..dbf7190b4144 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o -obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o +obj-y := core.o +obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 7008c3735e99..b4085deb9fa0 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); - t = damon_new_target(42); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - t = damon_new_target(42); - KUNIT_EXPECT_EQ(test, 42ul, t->id); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test) static void damon_test_aggregate(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long target_ids[] = {1, 2, 3}; unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; @@ -86,7 +84,10 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; - damon_set_targets(ctx, target_ids, 3); + for (it = 0; it < 3; it++) { + t = damon_new_target(); + damon_add_target(ctx, t); + } it = 0; damon_for_each_target(t, ctx) { @@ -122,7 +123,7 @@ static void damon_test_split_at(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); damon_split_region_at(c, t, r, 25); @@ -143,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test) struct damon_region *r, *r2, *r3; int i; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; damon_add_region(r, t); @@ -191,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test) unsigned long eaddrs[] = {112, 130, 156, 170, 230}; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; @@ -215,14 +216,14 @@ static void damon_test_split_regions_of(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); 
damon_split_regions_of(c, t, 4); diff --git a/mm/damon/core.c b/mm/damon/core.c index e92497895202..c1e0fed4e877 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/mm.h> -#include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> @@ -23,11 +22,75 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +static bool running_exclusive_ctxs; + +static DEFINE_MUTEX(damon_ops_lock); +static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; + +/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ +static bool damon_registered_ops_id(enum damon_ops_id id) +{ + struct damon_operations empty_ops = {}; + + if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops))) + return false; + return true; +} + +/** + * damon_register_ops() - Register a monitoring operations set to DAMON. + * @ops: monitoring operations set to register. + * + * This function registers a monitoring operations set of valid &struct + * damon_operations->id so that others can find and use them later. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_register_ops(struct damon_operations *ops) +{ + int err = 0; + + if (ops->id >= NR_DAMON_OPS) + return -EINVAL; + mutex_lock(&damon_ops_lock); + /* Fail for already registered ops */ + if (damon_registered_ops_id(ops->id)) { + err = -EINVAL; + goto out; + } + damon_registered_ops[ops->id] = *ops; +out: + mutex_unlock(&damon_ops_lock); + return err; +} + +/** + * damon_select_ops() - Select a monitoring operations to use with the context. + * @ctx: monitoring context to use the operations. + * @id: id of the registered monitoring operations to select. + * + * This function finds registered monitoring operations set of @id and make + * @ctx to use it. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) +{ + int err = 0; + + if (id >= NR_DAMON_OPS) + return -EINVAL; + + mutex_lock(&damon_ops_lock); + if (!damon_registered_ops_id(id)) + err = -EINVAL; + else + ctx->ops = damon_registered_ops[id]; + mutex_unlock(&damon_ops_lock); + return err; +} /* * Construct a damon_region struct @@ -53,17 +116,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); @@ -106,8 +158,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -160,7 +211,7 @@ void damon_destroy_scheme(struct damos *s) * * Returns the pointer to the new struct if success, or NULL otherwise */ -struct damon_target *damon_new_target(unsigned long id) +struct damon_target *damon_new_target(void) { struct damon_target *t; @@ -168,7 +219,7 @@ struct damon_target *damon_new_target(unsigned long id) if (!t) return NULL; - t->id = id; + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); @@ -220,10 +271,10 @@ struct damon_ctx *damon_new_ctx(void) ctx->sample_interval = 5 * 1000; ctx->aggr_interval = 100 * 1000; - ctx->primitive_update_interval = 60 * 1000 * 1000; + ctx->ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); - ctx->last_primitive_update = ctx->last_aggregation; + ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); @@ -240,8 +291,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; - if (ctx->primitive.cleanup) { - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) { + ctx->ops.cleanup(ctx); return; } @@ -262,43 +313,11 @@ void damon_destroy_ctx(struct damon_ctx *ctx) } /** - * damon_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids - * - * This function should not be called while the kdamond is running. - * - * Return: 0 on success, negative error code otherwise. - */ -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_destroy_targets(ctx); - - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); - if (!t) { - /* The caller should do cleanup of the ids itself */ - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - return -ENOMEM; - } - damon_add_target(ctx, t); - } - - return 0; -} - -/** * damon_set_attrs() - Set attributes for the monitoring. 
* @ctx: monitoring context * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations - * @primitive_upd_int: time interval between monitoring primitive updates + * @ops_upd_int: time interval between monitoring operations updates * @min_nr_reg: minimal number of regions * @max_nr_reg: maximum number of regions * @@ -308,7 +327,7 @@ int damon_set_targets(struct damon_ctx *ctx, * Return: 0 on success, negative error code otherwise. */ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long aggr_int, unsigned long ops_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg) { if (min_nr_reg < 3) @@ -318,7 +337,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; - ctx->primitive_update_interval = primitive_upd_int; + ctx->ops_update_interval = ops_upd_int; ctx->min_nr_regions = min_nr_reg; ctx->max_nr_regions = max_nr_reg; @@ -416,22 +435,25 @@ static int __damon_start(struct damon_ctx *ctx) * damon_start() - Starts the monitorings for a given group of contexts. * @ctxs: an array of the pointers for contexts to start monitoring * @nr_ctxs: size of @ctxs + * @exclusive: exclusiveness of this contexts group * * This function starts a group of monitoring threads for a group of monitoring * contexts. One thread per each context is created and run in parallel. The - * caller should handle synchronization between the threads by itself. If a - * group of threads that created by other 'damon_start()' call is currently - * running, this function does nothing but returns -EBUSY. + * caller should handle synchronization between the threads by itself. If + * @exclusive is true and a group of threads that created by other + * 'damon_start()' call is currently running, this function does nothing but + * returns -EBUSY. * * Return: 0 on success, negative error code otherwise. */ -int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) { int i; int err = 0; mutex_lock(&damon_lock); - if (nr_running_ctxs) { + if ((exclusive && nr_running_ctxs) || + (!exclusive && running_exclusive_ctxs)) { mutex_unlock(&damon_lock); return -EBUSY; } @@ -442,13 +464,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) break; nr_running_ctxs++; } + if (exclusive && nr_running_ctxs) + running_exclusive_ctxs = true; mutex_unlock(&damon_lock); return err; } /* - * __damon_stop() - Stops monitoring of given context. + * __damon_stop() - Stops monitoring of a given context. * @ctx: monitoring context * * Return: 0 on success, negative error code otherwise. 
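To show how the renamed monitoring "operations" interface from this commit fits together, here is a minimal sketch of an in-kernel caller. It is not part of this diff: the helper name example_start() is hypothetical, error paths and monitoring-region setup are trimmed, and it assumes the physical-address operations set (DAMON_OPS_PADDR) has already been registered via damon_register_ops():

	static struct damon_ctx *example_ctx;

	/* hypothetical helper: build a context, pick an ops set, start monitoring */
	static int example_start(void)
	{
		struct damon_target *t;

		example_ctx = damon_new_ctx();
		if (!example_ctx)
			return -ENOMEM;
		/* fails with -EINVAL if DAMON_OPS_PADDR was never registered */
		if (damon_select_ops(example_ctx, DAMON_OPS_PADDR))
			return -EINVAL;
		t = damon_new_target();
		if (!t)
			return -ENOMEM;
		damon_add_target(example_ctx, t);
		/* false: non-exclusive, may run next to other non-exclusive contexts */
		return damon_start(&example_ctx, 1, false);
	}

These are the same calls the debugfs interface further down in this diff makes on behalf of its users: damon_select_ops() replaces the old damon_va_set_primitives()/damon_pa_set_primitives() pair, and damon_start() now takes the extra exclusiveness flag.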
@@ -486,9 +510,8 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) /* nr_running_ctxs is decremented in kdamond_fn */ err = __damon_stop(ctxs[i]); if (err) - return err; + break; } - return err; } @@ -530,15 +553,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } @@ -562,10 +587,10 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, { bool ret = __damos_valid_target(r, s); - if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + if (!ret || !s->quota.esz || !c->ops.get_scheme_score) return ret; - return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -578,6 +603,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -621,7 +647,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Apply the scheme */ - if (c->primitive.apply_scheme) { + if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, @@ -631,7 +657,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -645,8 +671,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } @@ -694,13 +723,15 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; damos_set_effective_quota(quota); } - if (!c->primitive.get_scheme_score) + if (!c->ops.get_scheme_score) continue; /* Fill up the score histogram */ @@ -709,7 +740,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->primitive.get_scheme_score( + score = c->ops.get_scheme_score( c, t, r, s); quota->histogram[score] += r->ar.end - r->ar.start; @@ -733,7 +764,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region @@ -750,8 +784,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? 
a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +797,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else @@ -887,14 +919,15 @@ static void kdamond_split_regions(struct damon_ctx *ctx) } /* - * Check whether it is time to check and apply the target monitoring regions + * Check whether it is time to check and apply the operations-related data + * structures. * * Returns true if it is. */ -static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +static bool kdamond_need_update_operations(struct damon_ctx *ctx) { - return damon_check_reset_time_interval(&ctx->last_primitive_update, - ctx->primitive_update_interval); + return damon_check_reset_time_interval(&ctx->last_ops_update, + ctx->ops_update_interval); } /* @@ -912,11 +945,11 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (kthread_should_stop()) return true; - if (!ctx->primitive.target_valid) + if (!ctx->ops.target_valid) return false; damon_for_each_target(t, ctx) { - if (ctx->primitive.target_valid(t)) + if (ctx->ops.target_valid(t)) return false; } @@ -1015,8 +1048,8 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); - if (ctx->primitive.init) - ctx->primitive.init(ctx); + if (ctx->ops.init) + ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) done = true; @@ -1026,16 +1059,16 @@ static int kdamond_fn(void *data) if (kdamond_wait_activation(ctx)) continue; - if (ctx->primitive.prepare_access_checks) - ctx->primitive.prepare_access_checks(ctx); + if (ctx->ops.prepare_access_checks) + ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) done = true; kdamond_usleep(ctx->sample_interval); - if (ctx->primitive.check_accesses) - max_nr_accesses = ctx->primitive.check_accesses(ctx); + if (ctx->ops.check_accesses) + max_nr_accesses = ctx->ops.check_accesses(ctx); if (kdamond_aggregate_interval_passed(ctx)) { kdamond_merge_regions(ctx, @@ -1047,13 +1080,13 @@ static int kdamond_fn(void *data) kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); - if (ctx->primitive.reset_aggregated) - ctx->primitive.reset_aggregated(ctx); + if (ctx->ops.reset_aggregated) + ctx->ops.reset_aggregated(ctx); } - if (kdamond_need_update_primitive(ctx)) { - if (ctx->primitive.update) - ctx->primitive.update(ctx); + if (kdamond_need_update_operations(ctx)) { + if (ctx->ops.update) + ctx->ops.update(ctx); sz_limit = damon_region_sz_limit(ctx); } } @@ -1064,8 +1097,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_terminate) ctx->callback.before_terminate(ctx); - if (ctx->primitive.cleanup) - ctx->primitive.cleanup(ctx); + if (ctx->ops.cleanup) + ctx->ops.cleanup(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); @@ -1074,6 +1107,8 @@ static int kdamond_fn(void *data) mutex_lock(&damon_lock); nr_running_ctxs--; + if (!nr_running_ctxs && running_exclusive_ctxs) + running_exclusive_ctxs = 
false; mutex_unlock(&damon_lock); return 0; diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 86b9f9528231..0bb0d532b159 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -12,66 +12,58 @@ #include <kunit/test.h> -static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +static void damon_dbgfs_test_str_to_ints(struct kunit *test) { char *question; - unsigned long *answers; - unsigned long expected[] = {12, 35, 46}; + int *answers; + int expected[] = {12, 35, 46}; ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = ""; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); } @@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) static void damon_dbgfs_test_set_targets(struct kunit *test) { struct damon_ctx *ctx = dbgfs_new_ctx(); - unsigned long ids[] = {1, 2, 3}; char buf[64]; - /* Make DAMON consider target id as plain number */ - ctx->primitive.target_valid = NULL; - ctx->primitive.cleanup = NULL; + /* Make DAMON consider target has no pid */ + damon_select_ops(ctx, DAMON_OPS_PADDR); - damon_set_targets(ctx, ids, 3); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + 
dbgfs_set_targets(ctx, 1, NULL); sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - damon_set_targets(ctx, (unsigned long []){2}, 1); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); - - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -112,25 +94,26 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long ids[] = {1, 2, 3}; - /* Each line represents one region in ``<target id> <start> <end>`` */ - char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", - "2 10 20\n", - "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + /* Each line represents one region in ``<target idx> <start> <end>`` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", ""}; /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", - "2 10 20\n", - "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", ""}; - char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ - "2 10 20\n 2 14 26\n", /* regions overlap */ - "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ char *input, *expect; int i, rc; char buf[256]; - damon_set_targets(ctx, ids, 3); + damon_select_ops(ctx, DAMON_OPS_PADDR); + + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,12 +141,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); damon_destroy_ctx(ctx); } static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_str_to_ints), KUNIT_CASE(damon_dbgfs_test_set_targets), KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index ad65436756af..a0dab8b5e45f 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -56,7 +56,7 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", ctx->sample_interval, ctx->aggr_interval, - ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->ops_update_interval, ctx->min_nr_regions, ctx->max_nr_regions); mutex_unlock(&ctx->kdamond_lock); @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, 
s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; @@ -213,6 +215,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); @@ -266,25 +275,27 @@ out: return ret; } -static inline bool targetid_is_pid(const struct damon_ctx *ctx) +static inline bool target_has_pid(const struct damon_ctx *ctx) { - return ctx->primitive.target_valid == damon_va_target_valid; + return ctx->ops.id == DAMON_OPS_VADDR; } static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; - unsigned long id; + int id; int written = 0; int rc; damon_for_each_target(t, ctx) { - id = t->id; - if (targetid_is_pid(ctx)) + if (target_has_pid(ctx)) /* Show pid numbers to debugfs users */ - id = (unsigned long)pid_vnr((struct pid *)id); + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; - rc = scnprintf(&buf[written], len - written, "%lu ", id); + rc = scnprintf(&buf[written], len - written, "%d ", id); if (!rc) return -ENOMEM; written += rc; @@ -312,117 +323,177 @@ static ssize_t dbgfs_target_ids_read(struct file *file, } /* - * Converts a string into an array of unsigned long integers + * Converts a string into an integers array * - * Returns an array of unsigned long integers if the conversion success, or - * NULL otherwise. + * Returns an array of integers array if the conversion success, or NULL + * otherwise. */ -static unsigned long *str_to_target_ids(const char *str, ssize_t len, - ssize_t *nr_ids) +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) { - unsigned long *ids; - const int max_nr_ids = 32; - unsigned long id; + int *array; + const int max_nr_ints = 32; + int nr; int pos = 0, parsed, ret; - *nr_ids = 0; - ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); - if (!ids) + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) return NULL; - while (*nr_ids < max_nr_ids && pos < len) { - ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); pos += parsed; if (ret != 1) break; - ids[*nr_ids] = id; - *nr_ids += 1; + array[*nr_ints] = nr; + *nr_ints += 1; } - return ids; + return array; } -static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +static void dbgfs_put_pids(struct pid **pids, int nr_pids) { int i; - for (i = 0; i < nr_ids; i++) - put_pid((struct pid *)ids[i]); + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. 
+ */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; +} + +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) + * + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. + * + * Return: 0 on success, negative error code otherwise. + */ +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); + if (!t) { + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); + return -ENOMEM; + } + if (target_has_pid(ctx)) + t->pid = pids[i]; + damon_add_target(ctx, t); + } + + return 0; } static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; - unsigned long *targets; + char *kbuf; + struct pid **target_pids = NULL; ssize_t nr_targets; ssize_t ret; - int i; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; - /* target id is meaningless here, but we set it just for fun */ - scnprintf(kbuf, count, "42 "); - } - - targets = str_to_target_ids(nrs, count, &nr_targets); - if (!targets) { - ret = -ENOMEM; - goto out; + nr_targets = 1; } if (id_is_pid) { - for (i = 0; i < nr_targets; i++) { - targets[i] = (unsigned long)find_get_pid( - (int)targets[i]); - if (!targets[i]) { - dbgfs_put_pids(targets, i); - ret = -EINVAL; - goto free_targets_out; - } + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; } } mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); + dbgfs_put_pids(target_pids, nr_targets); ret = -EBUSY; goto unlock_out; } /* remove previously set targets */ - damon_for_each_target_safe(t, next_t, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); - damon_destroy_target(t); + dbgfs_set_targets(ctx, 0, NULL); + if (!nr_targets) { + ret = count; + goto unlock_out; } /* Configure the context for the address space type */ if (id_is_pid) - damon_va_set_primitives(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_VADDR); else - damon_pa_set_primitives(ctx); + ret = damon_select_ops(ctx, DAMON_OPS_PADDR); + if (ret) + goto unlock_out; - ret = damon_set_targets(ctx, targets, nr_targets); - if (ret) { - if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); - } else { + ret = 
dbgfs_set_targets(ctx, nr_targets, target_pids); + if (!ret) ret = count; - } unlock_out: mutex_unlock(&ctx->kdamond_lock); -free_targets_out: - kfree(targets); + kfree(target_pids); out: kfree(kbuf); return ret; @@ -432,18 +503,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) { struct damon_target *t; struct damon_region *r; + int target_idx = 0; int written = 0; int rc; damon_for_each_target(t, c) { damon_for_each_region(r, t) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %lu\n", - t->id, r->ar.start, r->ar.end); + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); if (!rc) return -ENOMEM; written += rc; } + target_idx++; } return written; } @@ -477,22 +550,19 @@ out: return len; } -static int add_init_region(struct damon_ctx *c, - unsigned long target_id, struct damon_addr_range *ar) +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) { struct damon_target *t; struct damon_region *r, *prev; - unsigned long id; + unsigned long idx = 0; int rc = -EINVAL; if (ar->start >= ar->end) return -EINVAL; damon_for_each_target(t, c) { - id = t->id; - if (targetid_is_pid(c)) - id = (unsigned long)pid_vnr((struct pid *)id); - if (id == target_id) { + if (idx++ == target_idx) { r = damon_new_region(ar->start, ar->end); if (!r) return -ENOMEM; @@ -515,7 +585,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) struct damon_target *t; struct damon_region *r, *next; int pos = 0, parsed, ret; - unsigned long target_id; + int target_idx; struct damon_addr_range ar; int err; @@ -525,11 +595,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) } while (pos < len) { - ret = sscanf(&str[pos], "%lu %lu %lu%n", - &target_id, &ar.start, &ar.end, &parsed); + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); if (ret != 3) break; - err = add_init_region(c, target_id, &ar); + err = add_init_region(c, target_idx, &ar); if (err) goto fail; pos += parsed; @@ -652,12 +722,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!targetid_is_pid(ctx)) + if (!target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { - put_pid((struct pid *)t->id); + put_pid(t->pid); damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); @@ -671,7 +741,11 @@ static struct damon_ctx *dbgfs_new_ctx(void) if (!ctx) return NULL; - damon_va_set_primitives(ctx); + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && + damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return NULL; + } ctx->callback.before_terminate = dbgfs_before_terminate; return ctx; } @@ -893,7 +967,7 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); } else if (!strncmp(kbuf, "off", count)) { ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); } else { diff --git a/mm/damon/prmtv-common.c b/mm/damon/ops-common.c index 92a04f5831d6..e346cc10d143 100644 --- a/mm/damon/prmtv-common.c +++ b/mm/damon/ops-common.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> -#include "prmtv-common.h" +#include "ops-common.h" /* * Get an online page for a pfn if it's in the LRU list. 
Otherwise, returns diff --git a/mm/damon/prmtv-common.h b/mm/damon/ops-common.h index 61f27037603e..e790cb5f8fe0 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/ops-common.h @@ -6,10 +6,6 @@ */ #include <linux/damon.h> -#include <linux/random.h> - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d6..21474ae63bc7 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -14,16 +14,12 @@ #include <linux/swap.h> #include "../internal.h" -#include "prmtv-common.h" +#include "ops-common.h" -static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, +static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; @@ -37,32 +33,34 @@ static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, static void damon_pa_mkold(unsigned long paddr) { + struct folio *folio; struct page *page = damon_get_page(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { .rmap_one = __damon_pa_mkold, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; if (!page) return; + folio = page_folio(page); - if (!page_mapped(page) || !page_rmapping(page)) { - set_page_idle(page); + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + folio_set_idle(folio); goto out; } - need_lock = !PageAnon(page) || PageKsm(page); - if (need_lock && !trylock_page(page)) + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) goto out; - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); if (need_lock) - unlock_page(page); + folio_unlock(folio); out: - put_page(page); + folio_put(folio); } static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, @@ -73,7 +71,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -89,15 +87,11 @@ struct damon_pa_access_chk_result { bool accessed; }; -static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, +static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { struct damon_pa_access_chk_result *result = arg; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); result->accessed = false; result->page_sz = PAGE_SIZE; @@ -105,12 +99,12 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, addr = pvmw.address; if (pvmw.pte) { result->accessed = pte_young(*pvmw.pte) || - !page_is_idle(page) || + !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE result->accessed = pmd_young(*pvmw.pmd) || - !page_is_idle(page) || + !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); #else @@ -129,6 +123,7 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, static bool damon_pa_young(unsigned long paddr, unsigned long 
*page_sz) { + struct folio *folio; struct page *page = damon_get_page(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { .page_sz = PAGE_SIZE, @@ -137,33 +132,34 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) struct rmap_walk_control rwc = { .arg = &result, .rmap_one = __damon_pa_young, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; if (!page) return false; + folio = page_folio(page); - if (!page_mapped(page) || !page_rmapping(page)) { - if (page_is_idle(page)) + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + if (folio_test_idle(folio)) result.accessed = false; else result.accessed = true; - put_page(page); + folio_put(folio); goto out; } - need_lock = !PageAnon(page) || PageKsm(page); - if (need_lock && !trylock_page(page)) { - put_page(page); - return NULL; + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) { + folio_put(folio); + return false; } - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); if (need_lock) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: *page_sz = result.page_sz; @@ -192,7 +188,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -208,19 +204,15 @@ unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -bool damon_pa_target_valid(void *t) -{ - return true; -} - -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,13 +233,14 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: @@ -259,15 +252,22 @@ int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, return DAMOS_MAX_SCORE; } -void damon_pa_set_primitives(struct damon_ctx *ctx) +static int __init damon_pa_initcall(void) { - ctx->primitive.init = NULL; - ctx->primitive.update = NULL; - ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; - ctx->primitive.check_accesses = damon_pa_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_pa_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_pa_apply_scheme; - ctx->primitive.get_scheme_score = damon_pa_scheme_score; -} + struct damon_operations ops = { + .id = DAMON_OPS_PADDR, + .init = NULL, + .update = NULL, + .prepare_access_checks = damon_pa_prepare_access_checks, + 
.check_accesses = damon_pa_check_accesses, + .reset_aggregated = NULL, + .target_valid = NULL, + .cleanup = NULL, + .apply_scheme = damon_pa_apply_scheme, + .get_scheme_score = damon_pa_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_pa_initcall); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf..e34c4d0c4d93 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. + */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -300,7 +330,7 @@ static int damon_reclaim_turn(bool on) if (err) goto free_scheme_out; - err = damon_start(&ctx, 1); + err = damon_start(&ctx, 1, true); if (!err) { kdamond_pid = ctx->kdamond->pid; return 0; @@ -333,16 +363,33 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); if (!ctx) return -ENOMEM; - damon_pa_set_primitives(ctx); + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - /* 4242 means nothing but fun */ - target = damon_new_target(4242); + target = damon_new_target(); if (!target) { damon_destroy_ctx(ctx); return -ENOMEM; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c new file mode 100644 index 000000000000..48e434cd43d8 --- /dev/null +++ b/mm/damon/sysfs.c @@ -0,0 +1,2596 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park <[email protected]> + */ + +#include <linux/damon.h> +#include <linux/kobject.h> +#include <linux/pid.h> +#include <linux/sched.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(damon_sysfs_lock); + +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + 
GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return -EINVAL; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return -EINVAL; + + range->max = max; + return count; +} + +static void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +static struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct 
damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t high_show(struct kobject *kobj, + struct 
kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct 
damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + &quotas->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put(&quotas->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t 
ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, &quotas->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = 
damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put(&quotas->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if 
(err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + return 0; + +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = 
NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +static struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +/* + * init region directory + */ + +struct damon_sysfs_region { + struct kobject kobj; + unsigned long start; + unsigned long end; +}; + +static struct damon_sysfs_region *damon_sysfs_region_alloc( + unsigned long start, + unsigned long end) +{ + struct damon_sysfs_region *region = kmalloc(sizeof(*region), + GFP_KERNEL); + + if (!region) + return NULL; + region->kobj = (struct kobject){}; + region->start = start; + region->end = end; + return region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->start); +} + +static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, &region->start); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->end); +} + +static ssize_t end_store(struct kobject 
*kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, &region->end); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_region_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_region, kobj)); +} + +static struct kobj_attribute damon_sysfs_region_start_attr = + __ATTR_RW_MODE(start, 0600); + +static struct kobj_attribute damon_sysfs_region_end_attr = + __ATTR_RW_MODE(end, 0600); + +static struct attribute *damon_sysfs_region_attrs[] = { + &damon_sysfs_region_start_attr.attr, + &damon_sysfs_region_end_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_region); + +static struct kobj_type damon_sysfs_region_ktype = { + .release = damon_sysfs_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_region_groups, +}; + +/* + * init_regions directory + */ + +struct damon_sysfs_regions { + struct kobject kobj; + struct damon_sysfs_region **regions_arr; + int nr; +}; + +static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL); +} + +static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions) +{ + struct damon_sysfs_region **regions_arr = regions->regions_arr; + int i; + + for (i = 0; i < regions->nr; i++) + kobject_put(&regions_arr[i]->kobj); + regions->nr = 0; + kfree(regions_arr); + regions->regions_arr = NULL; +} + +static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, + int nr_regions) +{ + struct damon_sysfs_region **regions_arr, *region; + int err, i; + + damon_sysfs_regions_rm_dirs(regions); + if (!nr_regions) + return 0; + + regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!regions_arr) + return -ENOMEM; + regions->regions_arr = regions_arr; + + for (i = 0; i < nr_regions; i++) { + region = damon_sysfs_region_alloc(0, 0); + if (!region) { + damon_sysfs_regions_rm_dirs(regions); + return -ENOMEM; + } + + err = kobject_init_and_add(&region->kobj, + &damon_sysfs_region_ktype, &regions->kobj, + "%d", i); + if (err) { + kobject_put(&region->kobj); + damon_sysfs_regions_rm_dirs(regions); + return err; + } + + regions_arr[i] = region; + regions->nr++; + } + return 0; +} + +static ssize_t nr_regions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + + return sysfs_emit(buf, "%d\n", regions->nr); +} + +static ssize_t nr_regions_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_regions_add_dirs(regions, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_regions, kobj)); +} + +static struct kobj_attribute damon_sysfs_regions_nr_attr = + __ATTR_RW_MODE(nr_regions, 0600); + +static struct attribute *damon_sysfs_regions_attrs[] = { + &damon_sysfs_regions_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_regions); + +static struct kobj_type 
damon_sysfs_regions_ktype = { + .release = damon_sysfs_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_regions_groups, +}; + +/* + * target directory + */ + +struct damon_sysfs_target { + struct kobject kobj; + struct damon_sysfs_regions *regions; + int pid; +}; + +static struct damon_sysfs_target *damon_sysfs_target_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL); +} + +static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target) +{ + struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc(); + int err; + + if (!regions) + return -ENOMEM; + + err = kobject_init_and_add(&regions->kobj, &damon_sysfs_regions_ktype, + &target->kobj, "regions"); + if (err) + kobject_put(&regions->kobj); + else + target->regions = regions; + return err; +} + +static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target) +{ + damon_sysfs_regions_rm_dirs(target->regions); + kobject_put(&target->regions->kobj); +} + +static ssize_t pid_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%d\n", target->pid); +} + +static ssize_t pid_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + int err = kstrtoint(buf, 0, &target->pid); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_target_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_target, kobj)); +} + +static struct kobj_attribute damon_sysfs_target_pid_attr = + __ATTR_RW_MODE(pid_target, 0600); + +static struct attribute *damon_sysfs_target_attrs[] = { + &damon_sysfs_target_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_target); + +static struct kobj_type damon_sysfs_target_ktype = { + .release = damon_sysfs_target_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_target_groups, +}; + +/* + * targets directory + */ + +struct damon_sysfs_targets { + struct kobject kobj; + struct damon_sysfs_target **targets_arr; + int nr; +}; + +static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL); +} + +static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets) +{ + struct damon_sysfs_target **targets_arr = targets->targets_arr; + int i; + + for (i = 0; i < targets->nr; i++) { + damon_sysfs_target_rm_dirs(targets_arr[i]); + kobject_put(&targets_arr[i]->kobj); + } + targets->nr = 0; + kfree(targets_arr); + targets->targets_arr = NULL; +} + +static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets, + int nr_targets) +{ + struct damon_sysfs_target **targets_arr, *target; + int err, i; + + damon_sysfs_targets_rm_dirs(targets); + if (!nr_targets) + return 0; + + targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!targets_arr) + return -ENOMEM; + targets->targets_arr = targets_arr; + + for (i = 0; i < nr_targets; i++) { + target = damon_sysfs_target_alloc(); + if (!target) { + damon_sysfs_targets_rm_dirs(targets); + return -ENOMEM; + } + + err = kobject_init_and_add(&target->kobj, + &damon_sysfs_target_ktype, &targets->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_target_add_dirs(target); + if (err) + goto out; + + targets_arr[i] = target; + 
targets->nr++; + } + return 0; + +out: + damon_sysfs_targets_rm_dirs(targets); + kobject_put(&target->kobj); + return err; +} + +static ssize_t nr_targets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + + return sysfs_emit(buf, "%d\n", targets->nr); +} + +static ssize_t nr_targets_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_targets_add_dirs(targets, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_targets_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_targets, kobj)); +} + +static struct kobj_attribute damon_sysfs_targets_nr_attr = + __ATTR_RW_MODE(nr_targets, 0600); + +static struct attribute *damon_sysfs_targets_attrs[] = { + &damon_sysfs_targets_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_targets); + +static struct kobj_type damon_sysfs_targets_ktype = { + .release = damon_sysfs_targets_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_targets_groups, +}; + +/* + * intervals directory + */ + +struct damon_sysfs_intervals { + struct kobject kobj; + unsigned long sample_us; + unsigned long aggr_us; + unsigned long update_us; +}; + +static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( + unsigned long sample_us, unsigned long aggr_us, + unsigned long update_us) +{ + struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals), + GFP_KERNEL); + + if (!intervals) + return NULL; + + intervals->kobj = (struct kobject){}; + intervals->sample_us = sample_us; + intervals->aggr_us = aggr_us; + intervals->update_us = update_us; + return intervals; +} + +static ssize_t sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->sample_us); +} + +static ssize_t sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->sample_us = us; + return count; +} + +static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->aggr_us); +} + +static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->aggr_us = us; + return count; +} + +static ssize_t update_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", 
intervals->update_us); +} + +static ssize_t update_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return -EINVAL; + + intervals->update_us = us; + return count; +} + +static void damon_sysfs_intervals_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_sample_us_attr = + __ATTR_RW_MODE(sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr = + __ATTR_RW_MODE(aggr_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_update_us_attr = + __ATTR_RW_MODE(update_us, 0600); + +static struct attribute *damon_sysfs_intervals_attrs[] = { + &damon_sysfs_intervals_sample_us_attr.attr, + &damon_sysfs_intervals_aggr_us_attr.attr, + &damon_sysfs_intervals_update_us_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_intervals); + +static struct kobj_type damon_sysfs_intervals_ktype = { + .release = damon_sysfs_intervals_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_intervals_groups, +}; + +/* + * monitoring_attrs directory + */ + +struct damon_sysfs_attrs { + struct kobject kobj; + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; +}; + +static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) +{ + struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL); + + if (!attrs) + return NULL; + attrs->kobj = (struct kobject){}; + return attrs; +} + +static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) +{ + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; + int err; + + intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); + if (!intervals) + return -ENOMEM; + + err = kobject_init_and_add(&intervals->kobj, + &damon_sysfs_intervals_ktype, &attrs->kobj, + "intervals"); + if (err) + goto put_intervals_out; + attrs->intervals = intervals; + + nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); + if (!nr_regions_range) { + err = -ENOMEM; + goto put_intervals_out; + } + + err = kobject_init_and_add(&nr_regions_range->kobj, + &damon_sysfs_ul_range_ktype, &attrs->kobj, + "nr_regions"); + if (err) + goto put_nr_regions_intervals_out; + attrs->nr_regions_range = nr_regions_range; + return 0; + +put_nr_regions_intervals_out: + kobject_put(&nr_regions_range->kobj); + attrs->nr_regions_range = NULL; +put_intervals_out: + kobject_put(&intervals->kobj); + attrs->intervals = NULL; + return err; +} + +static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) +{ + kobject_put(&attrs->nr_regions_range->kobj); + kobject_put(&attrs->intervals->kobj); +} + +static void damon_sysfs_attrs_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_attrs, kobj)); +} + +static struct attribute *damon_sysfs_attrs_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_attrs); + +static struct kobj_type damon_sysfs_attrs_ktype = { + .release = damon_sysfs_attrs_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_attrs_groups, +}; + +/* + * context directory + */ + +/* This should match with enum damon_ops_id */ +static const char * const damon_sysfs_ops_strs[] = { + "vaddr", + "paddr", +}; + +struct damon_sysfs_context { + struct kobject kobj; + enum damon_ops_id ops_id; + 
struct damon_sysfs_attrs *attrs; + struct damon_sysfs_targets *targets; + struct damon_sysfs_schemes *schemes; +}; + +static struct damon_sysfs_context *damon_sysfs_context_alloc( + enum damon_ops_id ops_id) +{ + struct damon_sysfs_context *context = kmalloc(sizeof(*context), + GFP_KERNEL); + + if (!context) + return NULL; + context->kobj = (struct kobject){}; + context->ops_id = ops_id; + return context; +} + +static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context) +{ + struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc(); + int err; + + if (!attrs) + return -ENOMEM; + err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype, + &context->kobj, "monitoring_attrs"); + if (err) + goto out; + err = damon_sysfs_attrs_add_dirs(attrs); + if (err) + goto out; + context->attrs = attrs; + return 0; + +out: + kobject_put(&attrs->kobj); + return err; +} + +static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context) +{ + struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc(); + int err; + + if (!targets) + return -ENOMEM; + err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype, + &context->kobj, "targets"); + if (err) { + kobject_put(&targets->kobj); + return err; + } + context->targets = targets; + return 0; +} + +static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context) +{ + struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc(); + int err; + + if (!schemes) + return -ENOMEM; + err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype, + &context->kobj, "schemes"); + if (err) { + kobject_put(&schemes->kobj); + return err; + } + context->schemes = schemes; + return 0; +} + +static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) +{ + int err; + + err = damon_sysfs_context_set_attrs(context); + if (err) + return err; + + err = damon_sysfs_context_set_targets(context); + if (err) + goto put_attrs_out; + + err = damon_sysfs_context_set_schemes(context); + if (err) + goto put_targets_attrs_out; + return 0; + +put_targets_attrs_out: + kobject_put(&context->targets->kobj); + context->targets = NULL; +put_attrs_out: + kobject_put(&context->attrs->kobj); + context->attrs = NULL; + return err; +} + +static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) +{ + damon_sysfs_attrs_rm_dirs(context->attrs); + kobject_put(&context->attrs->kobj); + damon_sysfs_targets_rm_dirs(context->targets); + kobject_put(&context->targets->kobj); + damon_sysfs_schemes_rm_dirs(context->schemes); + kobject_put(&context->schemes->kobj); +} + +static ssize_t operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); +} + +static ssize_t operations_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + enum damon_ops_id id; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + context->ops_id = id; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_context_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_context, kobj)); +} + +static struct kobj_attribute damon_sysfs_context_operations_attr = + __ATTR_RW_MODE(operations, 0600); + +static struct 
attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_operations_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_context); + +static struct kobj_type damon_sysfs_context_ktype = { + .release = damon_sysfs_context_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_context_groups, +}; + +/* + * contexts directory + */ + +struct damon_sysfs_contexts { + struct kobject kobj; + struct damon_sysfs_context **contexts_arr; + int nr; +}; + +static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL); +} + +static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts) +{ + struct damon_sysfs_context **contexts_arr = contexts->contexts_arr; + int i; + + for (i = 0; i < contexts->nr; i++) { + damon_sysfs_context_rm_dirs(contexts_arr[i]); + kobject_put(&contexts_arr[i]->kobj); + } + contexts->nr = 0; + kfree(contexts_arr); + contexts->contexts_arr = NULL; +} + +static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts, + int nr_contexts) +{ + struct damon_sysfs_context **contexts_arr, *context; + int err, i; + + damon_sysfs_contexts_rm_dirs(contexts); + if (!nr_contexts) + return 0; + + contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!contexts_arr) + return -ENOMEM; + contexts->contexts_arr = contexts_arr; + + for (i = 0; i < nr_contexts; i++) { + context = damon_sysfs_context_alloc(DAMON_OPS_VADDR); + if (!context) { + damon_sysfs_contexts_rm_dirs(contexts); + return -ENOMEM; + } + + err = kobject_init_and_add(&context->kobj, + &damon_sysfs_context_ktype, &contexts->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_context_add_dirs(context); + if (err) + goto out; + + contexts_arr[i] = context; + contexts->nr++; + } + return 0; + +out: + damon_sysfs_contexts_rm_dirs(contexts); + kobject_put(&context->kobj); + return err; +} + +static ssize_t nr_contexts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + + return sysfs_emit(buf, "%d\n", contexts->nr); +} + +static ssize_t nr_contexts_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + /* TODO: support multiple contexts per kdamond */ + if (nr < 0 || 1 < nr) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_contexts_add_dirs(contexts, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_contexts_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_contexts, kobj)); +} + +static struct kobj_attribute damon_sysfs_contexts_nr_attr + = __ATTR_RW_MODE(nr_contexts, 0600); + +static struct attribute *damon_sysfs_contexts_attrs[] = { + &damon_sysfs_contexts_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_contexts); + +static struct kobj_type damon_sysfs_contexts_ktype = { + .release = damon_sysfs_contexts_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_contexts_groups, +}; + +/* + * kdamond directory + */ + +struct damon_sysfs_kdamond { + struct kobject kobj; + struct damon_sysfs_contexts *contexts; + struct damon_ctx *damon_ctx; +}; + +static 
struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL); +} + +static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_sysfs_contexts *contexts; + int err; + + contexts = damon_sysfs_contexts_alloc(); + if (!contexts) + return -ENOMEM; + + err = kobject_init_and_add(&contexts->kobj, + &damon_sysfs_contexts_ktype, &kdamond->kobj, + "contexts"); + if (err) { + kobject_put(&contexts->kobj); + return err; + } + kdamond->contexts = contexts; + + return err; +} + +static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) +{ + damon_sysfs_contexts_rm_dirs(kdamond->contexts); + kobject_put(&kdamond->contexts->kobj); +} + +static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx = kdamond->damon_ctx; + bool running; + + if (!ctx) + running = false; + else + running = damon_sysfs_ctx_running(ctx); + + return sysfs_emit(buf, "%s\n", running ? "on" : "off"); +} + +static int damon_sysfs_set_attrs(struct damon_ctx *ctx, + struct damon_sysfs_attrs *sys_attrs) +{ + struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_ul_range *sys_nr_regions = + sys_attrs->nr_regions_range; + + return damon_set_attrs(ctx, sys_intervals->sample_us, + sys_intervals->aggr_us, sys_intervals->update_us, + sys_nr_regions->min, sys_nr_regions->max); +} + +static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (ctx->ops.id == DAMON_OPS_VADDR) + put_pid(t->pid); + damon_destroy_target(t); + } +} + +static int damon_sysfs_set_regions(struct damon_target *t, + struct damon_sysfs_regions *sysfs_regions) +{ + int i; + + for (i = 0; i < sysfs_regions->nr; i++) { + struct damon_sysfs_region *sys_region = + sysfs_regions->regions_arr[i]; + struct damon_region *prev, *r; + + if (sys_region->start > sys_region->end) + return -EINVAL; + r = damon_new_region(sys_region->start, sys_region->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + } + return 0; +} + +static int damon_sysfs_set_targets(struct damon_ctx *ctx, + struct damon_sysfs_targets *sysfs_targets) +{ + int i, err; + + for (i = 0; i < sysfs_targets->nr; i++) { + struct damon_sysfs_target *sys_target = + sysfs_targets->targets_arr[i]; + struct damon_target *t = damon_new_target(); + + if (!t) { + damon_sysfs_destroy_targets(ctx); + return -ENOMEM; + } + if (ctx->ops.id == DAMON_OPS_VADDR) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) { + damon_sysfs_destroy_targets(ctx); + return -EINVAL; + } + } + damon_add_target(ctx, t); + err = damon_sysfs_set_regions(t, sys_target->regions); + if (err) { + damon_sysfs_destroy_targets(ctx); + return err; + } + } + return 0; +} + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = 
sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(pattern->sz->min, pattern->sz->max, + pattern->nr_accesses->min, pattern->nr_accesses->max, + pattern->age->min, pattern->age->max, + sysfs_scheme->action, &quota, &wmarks); +} + +static int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +static void damon_sysfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (ctx->ops.id != DAMON_OPS_VADDR) + return; + + mutex_lock(&ctx->kdamond_lock); + damon_for_each_target_safe(t, next, ctx) { + put_pid(t->pid); + damon_destroy_target(t); + } + mutex_unlock(&ctx->kdamond_lock); +} + +static struct damon_ctx *damon_sysfs_build_ctx( + struct damon_sysfs_context *sys_ctx) +{ + struct damon_ctx *ctx = damon_new_ctx(); + int err; + + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + goto out; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + goto out; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + goto out; + err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); + if (err) + goto out; + + ctx->callback.before_terminate = damon_sysfs_before_terminate; + return ctx; + +out: + damon_destroy_ctx(ctx); + return ERR_PTR(err); +} + +static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx; + int err; + + if (kdamond->damon_ctx && + damon_sysfs_ctx_running(kdamond->damon_ctx)) + return -EBUSY; + /* TODO: support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kdamond->damon_ctx = NULL; + + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + err = damon_start(&ctx, 1, false); + if (err) { + damon_destroy_ctx(ctx); + return err; + } + kdamond->damon_ctx = ctx; + return err; +} + +static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) +{ + if (!kdamond->damon_ctx) + return -EINVAL; + return damon_stop(&kdamond->damon_ctx, 1); + /* + * To allow users show final monitoring results of already turned-off + * DAMON, we free kdamond->damon_ctx in next + * damon_sysfs_turn_damon_on(), or kdamonds_nr_store() + */ +} + +static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damos *scheme; + int schemes_idx = 0; + + if (!ctx) + return -EINVAL; + mutex_lock(&ctx->kdamond_lock); + damon_for_each_scheme(scheme, ctx) { + struct
damon_sysfs_schemes *sysfs_schemes; + struct damon_sysfs_stats *sysfs_stats; + + sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } + mutex_unlock(&ctx->kdamond_lock); + return 0; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + ssize_t ret; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + if (sysfs_streq(buf, "on")) + ret = damon_sysfs_turn_damon_on(kdamond); + else if (sysfs_streq(buf, "off")) + ret = damon_sysfs_turn_damon_off(kdamond); + else if (sysfs_streq(buf, "update_schemes_stats")) + ret = damon_sysfs_update_schemes_stats(kdamond); + else + ret = -EINVAL; + mutex_unlock(&damon_sysfs_lock); + if (!ret) + ret = count; + return ret; +} + +static ssize_t pid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx; + int pid; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + ctx = kdamond->damon_ctx; + if (!ctx) { + pid = -1; + goto out; + } + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) + pid = -1; + else + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); +out: + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%d\n", pid); +} + +static void damon_sysfs_kdamond_release(struct kobject *kobj) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kfree(kdamond); +} + +static struct kobj_attribute damon_sysfs_kdamond_state_attr = + __ATTR_RW_MODE(state, 0600); + +static struct kobj_attribute damon_sysfs_kdamond_pid_attr = + __ATTR_RO_MODE(pid, 0400); + +static struct attribute *damon_sysfs_kdamond_attrs[] = { + &damon_sysfs_kdamond_state_attr.attr, + &damon_sysfs_kdamond_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamond); + +static struct kobj_type damon_sysfs_kdamond_ktype = { + .release = damon_sysfs_kdamond_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamond_groups, +}; + +/* + * kdamonds directory + */ + +struct damon_sysfs_kdamonds { + struct kobject kobj; + struct damon_sysfs_kdamond **kdamonds_arr; + int nr; +}; + +static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL); +} + +static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr; + int i; + + for (i = 0; i < kdamonds->nr; i++) { + damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]); + kobject_put(&kdamonds_arr[i]->kobj); + } + kdamonds->nr = 0; + kfree(kdamonds_arr); + kdamonds->kdamonds_arr = NULL; +} + +static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, + int nr_kdamonds) +{ + int nr_running_ctxs = 0; + int i; + + for (i = 0; i < nr_kdamonds; i++) { + struct damon_ctx *ctx = kdamonds[i]->damon_ctx; + + if (!ctx) + continue; + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + 
nr_running_ctxs++; + mutex_unlock(&ctx->kdamond_lock); + } + return nr_running_ctxs; +} + +static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, + int nr_kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; + int err, i; + + if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + return -EBUSY; + + damon_sysfs_kdamonds_rm_dirs(kdamonds); + if (!nr_kdamonds) + return 0; + + kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!kdamonds_arr) + return -ENOMEM; + kdamonds->kdamonds_arr = kdamonds_arr; + + for (i = 0; i < nr_kdamonds; i++) { + kdamond = damon_sysfs_kdamond_alloc(); + if (!kdamond) { + damon_sysfs_kdamonds_rm_dirs(kdamonds); + return -ENOMEM; + } + + err = kobject_init_and_add(&kdamond->kobj, + &damon_sysfs_kdamond_ktype, &kdamonds->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_kdamond_add_dirs(kdamond); + if (err) + goto out; + + kdamonds_arr[i] = kdamond; + kdamonds->nr++; + } + return 0; + +out: + damon_sysfs_kdamonds_rm_dirs(kdamonds); + kobject_put(&kdamond->kobj); + return err; +} + +static ssize_t nr_kdamonds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + + return sysfs_emit(buf, "%d\n", kdamonds->nr); +} + +static ssize_t nr_kdamonds_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + if (nr < 0) + return -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_kdamonds_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj)); +} + +static struct kobj_attribute damon_sysfs_kdamonds_nr_attr = + __ATTR_RW_MODE(nr_kdamonds, 0600); + +static struct attribute *damon_sysfs_kdamonds_attrs[] = { + &damon_sysfs_kdamonds_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); + +static struct kobj_type damon_sysfs_kdamonds_ktype = { + .release = damon_sysfs_kdamonds_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamonds_groups, +}; + +/* + * damon user interface directory + */ + +struct damon_sysfs_ui_dir { + struct kobject kobj; + struct damon_sysfs_kdamonds *kdamonds; +}; + +static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL); +} + +static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir) +{ + struct damon_sysfs_kdamonds *kdamonds; + int err; + + kdamonds = damon_sysfs_kdamonds_alloc(); + if (!kdamonds) + return -ENOMEM; + + err = kobject_init_and_add(&kdamonds->kobj, + &damon_sysfs_kdamonds_ktype, &ui_dir->kobj, + "kdamonds"); + if (err) { + kobject_put(&kdamonds->kobj); + return err; + } + ui_dir->kdamonds = kdamonds; + return err; +} + +static void damon_sysfs_ui_dir_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj)); +} + +static struct attribute *damon_sysfs_ui_dir_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); + +static struct kobj_type damon_sysfs_ui_dir_ktype = { + .release = damon_sysfs_ui_dir_release, + .sysfs_ops 
= &kobj_sysfs_ops, + .default_groups = damon_sysfs_ui_dir_groups, +}; + +static int __init damon_sysfs_init(void) +{ + struct kobject *damon_sysfs_root; + struct damon_sysfs_ui_dir *admin; + int err; + + damon_sysfs_root = kobject_create_and_add("damon", mm_kobj); + if (!damon_sysfs_root) + return -ENOMEM; + + admin = damon_sysfs_ui_dir_alloc(); + if (!admin) { + kobject_put(damon_sysfs_root); + return -ENOMEM; + } + err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype, + damon_sysfs_root, "admin"); + if (err) + goto out; + err = damon_sysfs_ui_dir_add_dirs(admin); + if (err) + goto out; + return 0; + +out: + kobject_put(&admin->kobj); + kobject_put(damon_sysfs_root); + return err; +} +subsys_initcall(damon_sysfs_init); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 6a1b9272ea12..1a55bb6c36c3 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_region *r; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); @@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test) static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); @@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test, static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; @@ -314,7 +314,7 @@ static struct kunit_case damon_test_cases[] = { }; static struct kunit_suite damon_test_suite = { - .name = "damon-primitives", + .name = "damon-operations", .test_cases = damon_test_cases, }; kunit_test_suite(damon_test_suite); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb1..b2ec0aa1ff45 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -15,7 +15,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> -#include "prmtv-common.h" +#include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION @@ -23,11 +23,13 @@ #endif /* - * 't->id' should be the pointer to the relevant 'struct pid' having reference + * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. 
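The sysfs.c changes above assemble the whole control hierarchy under /sys/kernel/mm/damon/admin (mm_kobj is exposed as /sys/kernel/mm). As a rough, hypothetical user-space sketch of driving it, using only the attribute names defined in this file and leaving the targets directory setup and error handling aside, one kdamond with a single vaddr context could be configured and started like this:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#define ADMIN "/sys/kernel/mm/damon/admin/"

/* Write a short string to a sysfs attribute file. */
static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* one kdamond with one monitoring context using the vaddr operations */
	sysfs_write(ADMIN "kdamonds/nr_kdamonds", "1");
	sysfs_write(ADMIN "kdamonds/0/contexts/nr_contexts", "1");
	sysfs_write(ADMIN "kdamonds/0/contexts/0/operations", "vaddr");
	/* tune the sampling interval; targets/ still needs to be populated */
	sysfs_write(ADMIN "kdamonds/0/contexts/0/monitoring_attrs/intervals/sample_us",
		    "5000");
	/* start monitoring; 'pid' and 'state' can then be read back */
	sysfs_write(ADMIN "kdamonds/0/state", "on");
	return 0;
}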
*/ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task(t->pid, PIDTYPE_PID); +} /* * Get the mm_struct of the given target @@ -98,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -146,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +151,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); @@ -240,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } @@ -272,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. 
*/ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -395,8 +394,62 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -410,7 +463,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -418,7 +471,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -429,7 +482,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -491,8 +544,44 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + 
priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, @@ -515,7 +604,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -539,7 +628,7 @@ static void damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -551,7 +640,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); @@ -564,7 +653,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(void *target) { struct damon_target *t = target; struct task_struct *task; @@ -579,32 +668,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 
0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -627,14 +718,15 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { @@ -647,17 +739,24 @@ int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, return DAMOS_MAX_SCORE; } -void damon_va_set_primitives(struct damon_ctx *ctx) +static int __init damon_va_initcall(void) { - ctx->primitive.init = damon_va_init; - ctx->primitive.update = damon_va_update; - ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; - ctx->primitive.check_accesses = damon_va_check_accesses; - ctx->primitive.reset_aggregated = NULL; - ctx->primitive.target_valid = damon_va_target_valid; - ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = damon_va_apply_scheme; - ctx->primitive.get_scheme_score = damon_va_scheme_score; -} + struct damon_operations ops = { + .id = DAMON_OPS_VADDR, + .init = damon_va_init, + .update = damon_va_update, + .prepare_access_checks = damon_va_prepare_access_checks, + .check_accesses = damon_va_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_va_target_valid, + .cleanup = NULL, + .apply_scheme = damon_va_apply_scheme, + .get_scheme_score = damon_va_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_va_initcall); #include "vaddr-test.h" diff --git a/mm/debug.c b/mm/debug.c index a05a39ff8fe4..eeb7ea3ca292 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -48,7 +48,8 @@ const struct trace_print_flags vmaflag_names[] = { static void __dump_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct address_space *mapping; bool compound = PageCompound(page); /* @@ -76,6 +77,7 @@ static void __dump_page(struct page *page) else mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); head = page; + folio = (struct folio *)page; compound = false; } else { mapping = page_mapping(page); @@ -92,16 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - if (hpage_pincount_available(page)) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", - head, compound_order(head), - head_compound_mapcount(head), - head_compound_pincount(head)); - } else { - pr_warn("head:%p order:%u compound_mapcount:%d\n", - head, compound_order(head), - head_compound_mapcount(head)); - } + pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + head, compound_order(head), + folio_entire_mapcount(folio), + head_compound_pincount(head)); } #ifdef CONFIG_MEMCG @@ -112,56 +108,8 @@ static void __dump_page(struct page *page) type = "ksm "; else if (PageAnon(page)) type = "anon "; - else if (mapping) { - struct inode *host; - 
const struct address_space_operations *a_ops; - struct hlist_node *dentry_first; - struct dentry *dentry_ptr; - struct dentry dentry; - unsigned long ino; - - /* - * mapping can be invalid pointer and we don't want to crash - * accessing it, so probe everything depending on it carefully - */ - if (get_kernel_nofault(host, &mapping->host) || - get_kernel_nofault(a_ops, &mapping->a_ops)) { - pr_warn("failed to read mapping contents, not a valid kernel address?\n"); - goto out_mapping; - } - - if (!host) { - pr_warn("aops:%ps\n", a_ops); - goto out_mapping; - } - - if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || - get_kernel_nofault(ino, &host->i_ino)) { - pr_warn("aops:%ps with invalid host inode %px\n", - a_ops, host); - goto out_mapping; - } - - if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); - goto out_mapping; - } - - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", - a_ops, ino, dentry_ptr); - } else { - /* - * if dentry is corrupted, the %pd handler may still - * crash, but it's unlikely that we reach here with a - * corrupted struct page - */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, ino, &dentry); - } - } -out_mapping: + else if (mapping) + dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %pGp%s\n", type, &head->flags, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 228e3954b90c..db2abd9e415b 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); + + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) @@ -652,7 +654,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); barrier(); - pte_clear(args->mm, args->vaddr, args->ptep); + ptep_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } @@ -888,8 +890,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) pr_debug("Validating swap migration\n"); /* - * make_migration_entry() expects given page to be - * locked, otherwise it stumbles upon a BUG_ON(). + * make_[readable|writable]_migration_entry() expects given page to + * be locked, otherwise it stumbles upon a BUG_ON(). 
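The __dump_page() hunk above drops an open-coded, fault-tolerant walk of a possibly bogus mapping pointer in favour of a shared dump_mapping() helper. The idiom underneath is get_kernel_nofault(), which copies a value out of a pointer that may not be a valid kernel address instead of dereferencing it directly. A minimal, hypothetical kernel-context sketch of that idiom, using only helpers that appear in the removed code:

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Probe mapping->host without oopsing if 'mapping' itself is garbage. */
static void example_probe_mapping(struct address_space *mapping)
{
	struct inode *host;

	if (get_kernel_nofault(host, &mapping->host)) {
		pr_warn("mapping %px is not a readable kernel address\n", mapping);
		return;
	}
	pr_info("mapping %px has host inode %px\n", mapping, host);
}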
*/ __SetPageLocked(page); swp = make_writable_migration_entry(page_to_pfn(page)); diff --git a/mm/dmapool.c b/mm/dmapool.c index 64b537b3ccb0..a7eb5d0eb2da 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 74984c23a87e..9bc12e526ed0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -17,6 +17,7 @@ #include <linux/vmalloc.h> #include <asm/fixmap.h> #include <asm/early_ioremap.h> +#include "internal.h" #ifdef CONFIG_MMU static int early_ioremap_debug __initdata; diff --git a/mm/fadvise.c b/mm/fadvise.c index d6baa4f451c5..338f16022012 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!inode_write_congested(mapping->host)) - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 39c4c46c6133..d2e6a79fe69d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,6 +21,7 @@ #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> @@ -34,13 +35,13 @@ #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> -#include <linux/cleancache.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> +#include <linux/migrate.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -71,7 +72,7 @@ * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) - * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * @@ -114,106 +115,94 @@ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) - * ->private_lock (zap_pte_range->__set_page_dirty_buffers) + * ->private_lock (zap_pte_range->block_dirty_folio) * * ->i_mmap_rwsem * ->tasklist_lock (memory_failure, collect_procs_ao) */ static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) + struct folio *folio, void *shadow) { - XA_STATE(xas, &mapping->i_pages, page->index); - unsigned int nr = 1; + XA_STATE(xas, &mapping->i_pages, folio->index); + long nr = 1; mapping_set_update(&xas, mapping); /* hugetlb pages are represented by a single entry in the xarray */ - if (!PageHuge(page)) { - xas_set_order(&xas, page->index, compound_order(page)); - nr = compound_nr(page); + if (!folio_test_hugetlb(folio)) { + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); } - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(nr != 1 && shadow, page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); - page->mapping = NULL; + 
folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } -static void unaccount_page_cache_page(struct address_space *mapping, - struct page *page) +static void filemap_unaccount_folio(struct address_space *mapping, + struct folio *folio) { - int nr; + long nr; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); - else - cleancache_invalidate_page(mapping, page); - - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(page_mapped(page), page); - if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + VM_BUG_ON_FOLIO(folio_mapped(folio), folio); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", - current->comm, page_to_pfn(page)); - dump_page(page, "still mapped when deleted"); + current->comm, folio_pfn(folio)); + dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(page); + mapcount = page_mapcount(&folio->page); if (mapping_exiting(mapping) && - page_count(page) >= mapcount + 2) { + folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's - * a good bet that actually the page is unmapped, + * a good bet that actually the folio is unmapped, * and we'd prefer not to leak it: if we're wrong, * some other bad page check should catch it later. */ - page_mapcount_reset(page); - page_ref_sub(page, mapcount); + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); } } - /* hugetlb pages do not participate in page cache accounting. */ - if (PageHuge(page)) + /* hugetlb folios do not participate in page cache accounting. */ + if (folio_test_hugetlb(folio)) return; - nr = thp_nr_pages(page); + nr = folio_nr_pages(folio); - __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_lruvec_page_state(page, NR_SHMEM, -nr); - if (PageTransHuge(page)) - __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); - } else if (PageTransHuge(page)) { - __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + } else if (folio_test_pmd_mappable(folio)) { + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* - * At this point page must be either written or cleaned by - * truncate. Dirty page here signals a bug and loss of + * At this point folio must be either written or cleaned by + * truncate. Dirty folio here signals a bug and loss of * unwritten data. * - * This fixes dirty accounting after removing the page entirely - * but leaves PageDirty set: it has no effect for truncated - * page and anyway will be cleared before returning page into + * This fixes dirty accounting after removing the folio entirely + * but leaves the dirty flag set: it has no effect for truncated + * folio and anyway will be cleared before returning folio to * buddy allocator. 
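filemap_unaccount_folio() is a largely mechanical conversion: each page accessor is replaced by the folio accessor that describes the same compound unit. A small, hypothetical kernel-context sketch of the equivalences the conversion relies on, assuming a page that is not a tail page:

#include <linux/mm.h>

static void example_folio_equivalences(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* a folio is just a typed view of the head page */
	VM_BUG_ON(&folio->page != compound_head(page));
	VM_BUG_ON(folio_ref_count(folio) != page_count(page));
	VM_BUG_ON(folio_test_swapbacked(folio) != PageSwapBacked(page));
	VM_BUG_ON(folio_test_hugetlb(folio) != PageHuge(page));
}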
*/ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio))) + folio_account_cleaned(folio, mapping, + inode_to_wb(mapping->host)); } /* @@ -221,87 +210,81 @@ static void unaccount_page_cache_page(struct address_space *mapping, * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __filemap_remove_folio(struct folio *folio, void *shadow) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; - trace_mm_filemap_delete_from_page_cache(page); - - unaccount_page_cache_page(mapping, page); - page_cache_delete(mapping, page, shadow); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); + page_cache_delete(mapping, folio, shadow); } -static void page_cache_free_page(struct address_space *mapping, - struct page *page) +void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); + int refs = 1; freepage = mapping->a_ops->freepage; if (freepage) - freepage(page); + freepage(&folio->page); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, thp_nr_pages(page)); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); - } else { - put_page(page); - } + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + refs = folio_nr_pages(folio); + folio_put_refs(folio, refs); } /** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache + * filemap_remove_folio - Remove folio from page cache. + * @folio: The folio. * - * This must be called only on pages that have been verified to be in the page - * cache and locked. It will never put the page into the free list, the caller - * has a reference on the page. + * This must be called only on folios that are locked and have been + * verified to be in the page cache. It will never put the folio into + * the free list because the caller has a reference on the page. */ -void delete_from_page_cache(struct page *page) +void filemap_remove_folio(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio->mapping; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - __delete_from_page_cache(page, NULL); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - page_cache_free_page(mapping, page); + filemap_free_folio(mapping, folio); } -EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_delete_batch - delete several pages from page cache - * @mapping: the mapping to which pages belong - * @pvec: pagevec with pages to delete + * page_cache_delete_batch - delete several folios from page cache + * @mapping: the mapping to which folios belong + * @fbatch: batch of folios to delete * - * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. - * It tolerates holes in @pvec (mapping entries at those indices are not - * modified). The function expects only THP head pages to be present in the - * @pvec. 
+ * The function walks over mapping->i_pages and removes folios passed in + * @fbatch from the mapping. The function expects @fbatch to be sorted + * by page index and is optimised for it to be dense. + * It tolerates holes in @fbatch (mapping entries at those indices are not + * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); - int total_pages = 0; + XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); + long total_pages = 0; int i = 0; - struct page *page; + struct folio *folio; mapping_set_update(&xas, mapping); - xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our @@ -310,54 +293,48 @@ static void page_cache_delete_batch(struct address_space *mapping, * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, - page); + if (folio != fbatch->folios[i]) { + VM_BUG_ON_FOLIO(folio->index > + fbatch->folios[i]->index, folio); continue; } - WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); - if (page->index == xas.xa_index) - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies on it */ + folio->mapping = NULL; + /* Leave folio->index set: truncation lookup relies on it */ - /* - * Move to the next page in the vector if this is a regular - * page or the index is of the last sub-page of this compound - * page. 
- */ - if (page->index + compound_nr(page) - 1 == xas.xa_index) - i++; + i++; xas_store(&xas, NULL); - total_pages++; + total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { int i; - if (!pagevec_count(pvec)) + if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - for (i = 0; i < pagevec_count(pvec); i++) { - trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - unaccount_page_cache_page(mapping, pvec->pages[i]); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); } - page_cache_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - for (i = 0; i < pagevec_count(pvec); i++) - page_cache_free_page(mapping, pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) + filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) @@ -646,8 +623,8 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -static bool filemap_range_has_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +644,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping, } rcu_read_unlock(); return page != NULL; - } - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. 
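With the pagevec replaced by a folio batch, callers of delete_from_page_cache_batch() now hand over a struct folio_batch of locked folios sorted by index. A hypothetical caller sketch, assuming the folios were already collected and locked elsewhere and that the code lives in mm/ where the function is declared (mm/internal.h); only the batch plumbing is shown:

#include <linux/pagevec.h>

static void example_remove_locked_folios(struct address_space *mapping,
					 struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the slots left; 0 means the batch is full */
		if (!folio_batch_add(&fbatch, folios[i])) {
			delete_from_page_cache_batch(mapping, &fbatch);
			folio_batch_init(&fbatch);
		}
	}
	if (folio_batch_count(&fbatch))
		delete_from_page_cache_batch(mapping, &fbatch);
}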
- */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - return filemap_range_has_writeback(mapping, start_byte, end_byte); -} -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range @@ -891,26 +842,27 @@ noinline int __filemap_add_folio(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, index); int huge = folio_test_hugetlb(folio); - int error; bool charged = false; + long nr = 1; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); - folio_get(folio); - folio->mapping = mapping; - folio->index = index; - if (!huge) { - error = mem_cgroup_charge(folio, NULL, gfp); + int error = mem_cgroup_charge(folio, NULL, gfp); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) - goto error; + return error; charged = true; + xas_set_order(&xas, index, folio_order(folio)); + nr = folio_nr_pages(folio); } gfp &= GFP_RECLAIM_MASK; + folio_ref_add(folio, nr); + folio->mapping = mapping; + folio->index = xas.xa_index; do { unsigned int order = xa_get_order(xas.xa, xas.xa_index); @@ -934,6 +886,8 @@ noinline int __filemap_add_folio(struct address_space *mapping, /* entry may have been split before we acquired lock */ order = xa_get_order(xas.xa, xas.xa_index); if (order > folio_order(folio)) { + /* How to handle large swap entries? */ + BUG_ON(shmem_mapping(mapping)); xas_split(&xas, old, order); xas_reset(&xas); } @@ -943,29 +897,31 @@ noinline int __filemap_add_folio(struct address_space *mapping, if (xas_error(&xas)) goto unlock; - mapping->nrpages++; + mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ - if (!huge) - __lruvec_stat_add_folio(folio, NR_FILE_PAGES); + if (!huge) { + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, nr); + } unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); - if (xas_error(&xas)) { - error = xas_error(&xas); - if (charged) - mem_cgroup_uncharge(folio); + if (xas_error(&xas)) goto error; - } - trace_mm_filemap_add_to_page_cache(&folio->page); + trace_mm_filemap_add_to_page_cache(folio); return 0; error: + if (charged) + mem_cgroup_uncharge(folio); folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - folio_put(folio); - return error; + folio_put_refs(folio, nr); + return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); @@ -1103,6 +1059,12 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + + /* + * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date, + * and splice's page_cache_pipe_buf_confirm() needs to see that. + */ + SetPageUptodate(ZERO_PAGE(0)); } /* @@ -1259,10 +1221,10 @@ enum behavior { * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like - * wait_on_page_writeback() waiting on PG_writeback. + * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, - * like put_and_wait_on_page_locked() on PG_locked. 
+ * like folio_put_wait_locked() on PG_locked. */ }; @@ -1426,6 +1388,95 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } +#ifdef CONFIG_MIGRATION +/** + * migration_entry_wait_on_locked - Wait for a migration entry to be removed + * @entry: migration swap entry. + * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required + * for pte entries, pass NULL for pmd entries. + * @ptl: already locked ptl. This function will drop the lock. + * + * Wait for a migration entry referencing the given page to be removed. This is + * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * this can be called without taking a reference on the page. Instead this + * should be called while holding the ptl for the migration entry referencing + * the page. + * + * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * + * This follows the same logic as folio_wait_bit_common() so see the comments + * there. + */ +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl) +{ + struct wait_page_queue wait_page; + wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + bool delayacct = false; + unsigned long pflags; + wait_queue_head_t *q; + struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + + q = folio_waitqueue(folio); + if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { + if (!folio_test_swapbacked(folio)) { + delayacct_thrashing_start(); + delayacct = true; + } + psi_memstall_enter(&pflags); + thrashing = true; + } + + init_wait(wait); + wait->func = wake_page_function; + wait_page.folio = folio; + wait_page.bit_nr = PG_locked; + wait->flags = 0; + + spin_lock_irq(&q->lock); + folio_set_waiters(folio); + if (!folio_trylock_flag(folio, PG_locked, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); + + /* + * If a migration entry exists for the page the migration path must hold + * a valid reference to the page, and it must take the ptl to remove the + * migration entry. So the page is valid until the ptl is dropped. + */ + if (ptep) + pte_unmap_unlock(ptep, ptl); + else + spin_unlock(ptl); + + for (;;) { + unsigned int flags; + + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) + break; + + io_schedule(); + continue; + } + break; + } + + finish_wait(q, wait); + + if (thrashing) { + if (delayacct) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } +} +#endif + void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); @@ -1439,22 +1490,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr) EXPORT_SYMBOL(folio_wait_bit_killable); /** - * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked - * @page: The page to wait for. + * folio_put_wait_locked - Drop a reference and wait for it to be unlocked + * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * - * The caller should hold a reference on @page. They expect the page to + * The caller should hold a reference on @folio. 
They expect the page to * become unlocked relatively soon, but do not wish to hold up migration - * (for example) by holding the reference while waiting for the page to + * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not - * dereference @page. + * dereference @folio. * - * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. + * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int put_and_wait_on_page_locked(struct page *page, int state) +int folio_put_wait_locked(struct folio *folio, int state) { - return folio_wait_bit_common(page_folio(page), PG_locked, state, - DROP); + return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** @@ -1979,37 +2029,36 @@ no_page: } EXPORT_SYMBOL(__filemap_get_folio); -static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, +static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { - struct page *page; + struct folio *folio; retry: if (mark == XA_PRESENT) - page = xas_find(xas, max); + folio = xas_find(xas, max); else - page = xas_find_marked(xas, max, mark); + folio = xas_find_marked(xas, max, mark); - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ - if (!page || xa_is_value(page)) - return page; + if (!folio || xa_is_value(folio)) + return folio; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto reset; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) { - put_page(page); + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); goto reset; } - return page; + return folio; reset: xas_reset(xas); goto retry; @@ -2020,56 +2069,36 @@ reset: * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. + * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in - * the mapping. The entries are placed in @pvec. find_get_entries() - * takes a reference on any actual pages it returns. + * the mapping. The entries are placed in @fbatch. find_get_entries() + * takes a reference on any actual folios it returns. * - * The search returns a group of mapping-contiguous page cache entries - * with ascending indexes. There may be holes in the indices due to - * not-present pages. + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries or large folios. * - * Any shadow entries of evicted pages, or swap entries from + * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * - * If it finds a Transparent Huge Page, head or tail, find_get_entries() - * stops at that page: the caller is likely to have a better way to handle - * the compound page as a whole, and then skip its extent, than repeatedly - * calling find_get_entries() to return all its tails. - * - * Return: the number of pages and shadow entries which were found. + * Return: The number of entries which were found. 
*/ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - unsigned nr_entries = PAGEVEC_SIZE; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - /* - * Terminate early on finding a THP, to allow the caller to - * handle it all at once; but continue if this is hugetlbfs. - */ - if (!xa_is_value(page) && PageTransHuge(page) && - !PageHuge(page)) { - page = find_subpage(page, xas.xa_index); - nr_entries = ret + 1; - } - - indices[ret] = xas.xa_index; - pvec->pages[ret] = page; - if (++ret == nr_entries) + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; } rcu_read_unlock(); - pvec->nr = ret; - return ret; + return folio_batch_count(fbatch); } /** @@ -2077,63 +2106,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. - * @indices: The cache indices of the entries in @pvec. + * @fbatch: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. - * Swap, shadow and DAX entries are included. Pages are returned - * locked and with an incremented refcount. Pages which are locked by - * somebody else or under writeback are skipped. Only the head page of - * a THP is returned. Pages which are partially outside the range are - * not returned. + * Swap, shadow and DAX entries are included. Folios are returned + * locked and with an incremented refcount. Folios which are locked + * by somebody else or under writeback are skipped. Folios which are + * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive - * due to not-present entries, THP pages, pages which could not be locked - * or pages under writeback. + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback. * * Return: The number of entries which were found. 
*/ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - if (!xa_is_value(page)) { - if (page->index < start) + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(folio)) { + if (folio->index < start) goto put; - if (page->index + thp_nr_pages(page) - 1 > end) + if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto put; - if (page->mapping != mapping || PageWriteback(page)) + if (folio->mapping != mapping || + folio_test_writeback(folio)) goto unlock; - VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), - page); + VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), + folio); } - indices[pvec->nr] = xas.xa_index; - if (!pagevec_add(pvec, page)) + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; - goto next; + continue; unlock: - unlock_page(page); + folio_unlock(folio); put: - put_page(page); -next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - - /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) - break; - } + folio_put(folio); } rcu_read_unlock(); - return pagevec_count(pvec); + return folio_batch_count(fbatch); +} + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; } /** @@ -2162,23 +2192,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *start); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } + if (folio_more_pages(folio, xas.xa_index, end)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } } /* @@ -2204,8 +2240,9 @@ out: * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages_contig() works exactly like find_get_pages(), except - * that the returned number of pages are guaranteed to be contiguous. + * find_get_pages_contig() works exactly like find_get_pages_range(), + * except that the returned number of pages are guaranteed to be + * contiguous. * * Return: the number of pages which were found. 
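For reference, a caller of the batched find_get_entries()/find_lock_entries() interface above might walk the returned folio_batch roughly as in the following sketch (simplified; process_folio() is a hypothetical placeholder and error handling is omitted):

        static void walk_entries(struct address_space *mapping, pgoff_t start,
                                 pgoff_t end)
        {
                struct folio_batch fbatch;
                pgoff_t indices[PAGEVEC_SIZE];
                unsigned int i;

                folio_batch_init(&fbatch);
                while (find_get_entries(mapping, start, end, &fbatch, indices)) {
                        for (i = 0; i < folio_batch_count(&fbatch); i++) {
                                struct folio *folio = fbatch.folios[i];

                                /* Shadow, swap and DAX entries carry no reference. */
                                if (xa_is_value(folio))
                                        continue;
                                process_folio(folio);   /* hypothetical */
                                folio_put(folio);
                        }
                        /* Resume after the last index returned in this batch. */
                        start = indices[folio_batch_count(&fbatch) - 1] + 1;
                        folio_batch_init(&fbatch);
                        cond_resched();
                }
        }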
*/ @@ -2213,36 +2250,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) break; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(folio != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) break; + if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } continue; put_page: - put_page(page); + folio_put(folio); retry: xas_reset(&xas); } @@ -2260,9 +2302,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages(), except we only return head pages which are tagged - * with @tag. @index is updated to the index immediately after the last - * page we return, ready for the next iteration. + * Like find_get_pages_range(), except we only return head pages which are + * tagged with @tag. @index is updated to the index immediately after the + * last page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2271,25 +2313,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *index); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, tag))) { + while ((folio = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = page; + pages[ret] = &folio->page; if (++ret == nr_pages) { - *index = page->index + thp_nr_pages(page); + *index = folio->index + folio_nr_pages(folio); goto out; } } @@ -2332,52 +2374,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) } /* - * filemap_get_read_batch - Get a batch of pages for read + * filemap_get_read_batch - Get a batch of folios for read * - * Get a batch of pages which represent a contiguous range of bytes - * in the file. No tail pages will be returned. If @index is in the - * middle of a THP, the entire THP will be returned. The last page in - * the batch may have Readahead set or be not Uptodate so that the - * caller can take the appropriate action. + * Get a batch of folios which represent a contiguous range of bytes in + * the file. No exceptional entries will be returned. If @index is in + * the middle of a folio, the entire folio will be returned. The last + * folio in the batch may have the readahead flag set or the uptodate flag + * clear so that the caller can take the appropriate action. 
*/ static void filemap_get_read_batch(struct address_space *mapping, - pgoff_t index, pgoff_t max, struct pagevec *pvec) + pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); - struct page *head; + struct folio *folio; rcu_read_lock(); - for (head = xas_load(&xas); head; head = xas_next(&xas)) { - if (xas_retry(&xas, head)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - if (xas.xa_index > max || xa_is_value(head)) + if (xas.xa_index > max || xa_is_value(folio)) break; - if (!page_cache_get_speculative(head)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(head != xas_reload(&xas))) - goto put_page; + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; - if (!pagevec_add(pvec, head)) + if (!folio_batch_add(fbatch, folio)) break; - if (!PageUptodate(head)) + if (!folio_test_uptodate(folio)) break; - if (PageReadahead(head)) + if (folio_test_readahead(folio)) break; - xas.xa_index = head->index + thp_nr_pages(head) - 1; - xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); continue; -put_page: - put_page(head); +put_folio: + folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } -static int filemap_read_page(struct file *file, struct address_space *mapping, - struct page *page) +static int filemap_read_folio(struct file *file, struct address_space *mapping, + struct folio *folio) { int error; @@ -2386,52 +2426,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, * eg. multipath errors. PG_error will be set again if readpage * fails. */ - ClearPageError(page); + folio_clear_error(folio); /* Start the actual read. The read will unlock the page. 
*/ - error = mapping->a_ops->readpage(file, page); + error = mapping->a_ops->readpage(file, &folio->page); if (error) return error; - error = wait_on_page_locked_killable(page); + error = folio_wait_locked_killable(folio); if (error) return error; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct page *page) + loff_t pos, struct iov_iter *iter, struct folio *folio) { int count; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (iov_iter_is_pipe(iter)) return false; if (!mapping->a_ops->is_partially_uptodate) return false; - if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + if (mapping->host->i_blkbits >= folio_shift(folio)) return false; count = iter->count; - if (page_offset(page) > pos) { - count -= page_offset(page) - pos; + if (folio_pos(folio) > pos) { + count -= folio_pos(folio) - pos; pos = 0; } else { - pos -= page_offset(page); + pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(page, pos, count); + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2447,7 +2486,11 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); + /* + * This is where we usually end up waiting for a + * previously submitted readahead to finish. + */ + folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); @@ -2460,14 +2503,14 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); + error = filemap_read_folio(iocb->ki_filp, mapping, folio); goto unlock_mapping; unlock: folio_unlock(folio); @@ -2478,70 +2521,72 @@ unlock_mapping: return error; } -static int filemap_create_page(struct file *file, +static int filemap_create_folio(struct file *file, struct address_space *mapping, pgoff_t index, - struct pagevec *pvec) + struct folio_batch *fbatch) { - struct page *page; + struct folio *folio; int error; - page = page_cache_alloc(mapping); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + if (!folio) return -ENOMEM; /* - * Protect against truncate / hole punch. Grabbing invalidate_lock here - * assures we cannot instantiate and bring uptodate new pagecache pages - * after evicting page cache during truncate and before actually - * freeing blocks. Note that we could release invalidate_lock after - * inserting the page into page cache as the locked page would then be - * enough to synchronize with hole punching. 
But there are code paths - * such as filemap_update_page() filling in partially uptodate pages or - * ->readpages() that need to hold invalidate_lock while mapping blocks - * for IO so let's hold the lock here as well to keep locking rules - * simple. + * Protect against truncate / hole punch. Grabbing invalidate_lock + * here assures we cannot instantiate and bring uptodate new + * pagecache folios after evicting page cache during truncate + * and before actually freeing blocks. Note that we could + * release invalidate_lock after inserting the folio into + * the page cache as the locked folio would then be enough to + * synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate + * pages or ->readpages() that need to hold invalidate_lock + * while mapping blocks for IO so let's hold the lock here as + * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - error = add_to_page_cache_lru(page, mapping, index, + error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); - pagevec_add(pvec, page); + folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); - put_page(page); + folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, - struct address_space *mapping, struct page *page, + struct address_space *mapping, struct folio *folio, pgoff_t last_index) { + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_async_readahead(mapping, &file->f_ra, file, page, - page->index, last_index - page->index); + page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct pagevec *pvec) + struct folio_batch *fbatch) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; - struct page *page; + struct folio *folio; int err = 0; last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); @@ -2549,34 +2594,35 @@ retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, pvec); - if (!pagevec_count(pvec)) { + filemap_get_read_batch(mapping, index, last_index, fbatch); + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, pvec); + filemap_get_read_batch(mapping, index, last_index, fbatch); } - if (!pagevec_count(pvec)) { + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_page(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, pvec); + err = filemap_create_folio(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } - page = pvec->pages[pagevec_count(pvec) - 1]; - if (PageReadahead(page)) { - err = filemap_readahead(iocb, filp, mapping, page, last_index); + folio = fbatch->folios[folio_batch_count(fbatch) - 1]; + if 
(folio_test_readahead(folio)) { + err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + if (!folio_test_uptodate(folio)) { + if ((iocb->ki_flags & IOCB_WAITQ) && + folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page); + err = filemap_update_page(iocb, mapping, iter, folio); if (err) goto err; } @@ -2584,8 +2630,8 @@ retry: return 0; err: if (err < 0) - put_page(page); - if (likely(--pvec->nr)) + folio_put(folio); + if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; @@ -2612,7 +2658,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct pagevec pvec; + struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2623,7 +2669,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { cond_resched(); @@ -2639,7 +2685,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &pvec); + error = filemap_get_pages(iocb, iter, &fbatch); if (error < 0) break; @@ -2653,7 +2699,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) - goto put_pages; + goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* @@ -2668,33 +2714,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pvec.pages[0]); + folio_mark_accessed(fbatch.folios[0]); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t page_size = thp_size(page); - size_t offset = iocb->ki_pos & (page_size - 1); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t fsize = folio_size(folio); + size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, - page_size - offset); + fsize - offset); size_t copied; - if (end_offset < page_offset(page)) + if (end_offset < folio_pos(folio)) break; if (i > 0) - mark_page_accessed(page); + folio_mark_accessed(folio); /* - * If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. 
*/ - if (writably_mapped) { - int j; + if (writably_mapped) + flush_dcache_folio(folio); - for (j = 0; j < thp_nr_pages(page); j++) - flush_dcache_page(page + j); - } - - copied = copy_page_to_iter(page, offset, bytes, iter); + copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; @@ -2705,10 +2747,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, break; } } -put_pages: - for (i = 0; i < pagevec_count(&pvec); i++) - put_page(pvec.pages[i]); - pagevec_reinit(&pvec); +put_folios: + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_put(fbatch.folios[i]); + folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); @@ -2793,44 +2835,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline loff_t page_seek_hole_data(struct xa_state *xas, - struct address_space *mapping, struct page *page, +static inline loff_t folio_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); - if (xa_is_value(page) || PageUptodate(page)) + if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); - lock_page(page); - if (unlikely(page->mapping != mapping)) + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) goto unlock; - offset = offset_in_thp(page, start) & ~(bsz - 1); + offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + if (ops->is_partially_uptodate(folio, offset, bsz) == + seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; - } while (offset < thp_size(page)); + } while (offset < folio_size(folio)); unlock: - unlock_page(page); + folio_unlock(folio); rcu_read_lock(); return start; } -static inline -unsigned int seek_page_size(struct xa_state *xas, struct page *page) +static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { - if (xa_is_value(page)) + if (xa_is_value(folio)) return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); - return thp_size(page); + return folio_size(folio); } /** @@ -2857,15 +2899,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); - struct page *page; + struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); - while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; - unsigned int seek_size; + size_t seek_size; if (start < pos) { if (!seek_data) @@ -2873,9 +2915,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - seek_size = seek_page_size(&xas, page); - pos = round_up(pos + 1, seek_size); - start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_size = seek_folio_size(&xas, folio); + pos = round_up((u64)pos + 1, seek_size); + start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; @@ -2883,15 +2925,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, 
loff_t start, break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); - if (!xa_is_value(page)) - put_page(page); + if (!xa_is_value(folio)) + folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); - if (page && !xa_is_value(page)) - put_page(page); + if (folio && !xa_is_value(folio)) + folio_put(folio); if (start > end) return end; return start; @@ -2900,21 +2942,20 @@ unlock: #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock + * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. - * @page - the page to lock. + * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * - * This works similar to lock_page_or_retry in that it can drop the mmap_lock. - * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin - * will point to the pinned file and needs to be fput()'ed at a later point. + * This works similar to lock_folio_or_retry in that it can drop the + * mmap_lock. It differs in that it actually returns the folio locked + * if it returns 1 and 0 if it couldn't lock the folio. If we did have + * to drop the mmap_lock then fpin will point to the pinned file and + * needs to be fput()'ed at a later point. */ -static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, +static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { - struct folio *folio = page_folio(page); - if (folio_trylock(folio)) return 1; @@ -2961,6 +3002,24 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *fpin = NULL; unsigned int mmap_miss; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* Use the readahead code, even if readahead is disabled */ + if (vmf->vma->vm_flags & VM_HUGEPAGE) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); + ra->size = HPAGE_PMD_NR; + /* + * Fetch two PMD folios, so we get the chance to actually + * readahead, unless we've been told not to. + */ + if (!(vmf->vma->vm_flags & VM_RAND_READ)) + ra->size *= 2; + ra->async_size = HPAGE_PMD_NR; + page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); + return fpin; + } +#endif + /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) return fpin; @@ -2993,7 +3052,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; - do_page_cache_ra(&ractl, ra->size, ra->async_size); + page_cache_ra_order(&ractl, ra, 0); return fpin; } @@ -3003,25 +3062,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * was pinned if we have to drop the mmap_lock in order to do IO. 
*/ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) + struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; - struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; - pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; + mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); - if (PageReadahead(page)) { + + if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_async_readahead(mapping, ra, file, - page, offset, ra->ra_pages); + page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } @@ -3040,7 +3099,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock - * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). + * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. @@ -3056,28 +3115,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t offset = vmf->pgoff; - pgoff_t max_off; - struct page *page; + pgoff_t max_idx, index = vmf->pgoff; + struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ - page = find_get_page(mapping, offset); - if (likely(page)) { + folio = filemap_get_folio(mapping, index); + if (likely(folio)) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) - fpin = do_async_mmap_readahead(vmf, page); - if (unlikely(!PageUptodate(page))) { + fpin = do_async_mmap_readahead(vmf, folio); + if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } @@ -3089,17 +3147,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) fpin = do_sync_mmap_readahead(vmf); retry_find: /* - * See comment in filemap_create_page() why we need + * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } - page = pagecache_get_page(mapping, offset, + folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) { + if (!folio) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3107,22 +3165,22 @@ retry_find: } } - if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? 
*/ - if (unlikely(compound_head(page)->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto retry_find; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { /* * The page was in cache and uptodate and now it is not. * Strange but possible since we didn't hold the page lock all @@ -3130,8 +3188,8 @@ retry_find: * try again. */ if (!mapping_locked) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto retry_find; } goto page_not_uptodate; @@ -3143,7 +3201,7 @@ retry_find: * redo the fault. */ if (fpin) { - unlock_page(page); + folio_unlock(folio); goto out_retry; } if (mapping_locked) @@ -3153,14 +3211,14 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) { - unlock_page(page); - put_page(page); + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) { + folio_unlock(folio); + folio_put(folio); return VM_FAULT_SIGBUS; } - vmf->page = page; + vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: @@ -3171,10 +3229,10 @@ page_not_uptodate: * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (fpin) goto out_retry; - put_page(page); + folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -3188,8 +3246,8 @@ out_retry: * re-find the vma and come back and find our hopefully still populated * page. */ - if (page) - put_page(page); + if (folio) + folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) @@ -3231,48 +3289,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) return false; } -static struct page *next_uptodate_page(struct page *page, +static struct folio *next_uptodate_page(struct folio *folio, struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { unsigned long max_idx; do { - if (!page) + if (!folio) return NULL; - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageLocked(page)) + if (folio_test_locked(folio)) continue; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) continue; /* Has the page moved or been split? 
*/ - if (unlikely(page != xas_reload(xas))) + if (unlikely(folio != xas_reload(xas))) goto skip; - if (!PageUptodate(page) || PageReadahead(page)) + if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto skip; - if (page->mapping != mapping) + if (folio->mapping != mapping) goto unlock; - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; - return page; + return folio; unlock: - unlock_page(page); + folio_unlock(folio); skip: - put_page(page); - } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + folio_put(folio); + } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } -static inline struct page *first_map_page(struct address_space *mapping, +static inline struct folio *first_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3280,7 +3338,7 @@ static inline struct page *first_map_page(struct address_space *mapping, mapping, xas, end_pgoff); } -static inline struct page *next_map_page(struct address_space *mapping, +static inline struct folio *next_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3297,16 +3355,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct folio *folio; + struct page *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; rcu_read_lock(); - head = first_map_page(mapping, &xas, end_pgoff); - if (!head) + folio = first_map_page(mapping, &xas, end_pgoff); + if (!folio) goto out; - if (filemap_map_pmd(vmf, head)) { + if (filemap_map_pmd(vmf, &folio->page)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3314,7 +3373,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { - page = find_subpage(head, xas.xa_index); +again: + page = folio_file_page(folio, xas.xa_index); if (PageHWPoison(page)) goto unlock; @@ -3335,12 +3395,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, do_set_pte(vmf, page, addr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, addr, vmf->pte); - unlock_page(head); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } + folio_unlock(folio); continue; unlock: - unlock_page(head); - put_page(head); - } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + goto again; + } + folio_unlock(folio); + folio_put(folio); + } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -3352,24 +3421,24 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + 
folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* - * We mark the page dirty already here so that when freeze is in + * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will - * see the dirty page and writeprotect it again. + * see the dirty folio and writeprotect it again. */ - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; @@ -3422,35 +3491,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static struct page *wait_on_page_read(struct page *page) +static struct folio *do_read_cache_folio(struct address_space *mapping, + pgoff_t index, filler_t filler, void *data, gfp_t gfp) { - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - put_page(page); - page = ERR_PTR(-EIO); - } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; + struct folio *folio; int err; repeat: - page = find_get_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp); - if (!page) + folio = filemap_get_folio(mapping, index); + if (!folio) { + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, gfp); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ @@ -3459,71 +3513,41 @@ repeat: filler: if (filler) - err = filler(data, page); + err = filler(data, &folio->page); else - err = mapping->a_ops->readpage(data, page); + err = mapping->a_ops->readpage(data, &folio->page); if (err < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(err); } - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; + folio_wait_locked(folio); + if (!folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + goto out; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* - * Page is not up to date and may be locked due to one of the following - * case a: Page is being filled and the page lock is held - * case b: Read/write error clearing the page uptodate status - * case c: Truncation in progress (page locked) - * case d: Reclaim in progress - * - * Case a, the page will be up to date when the page is unlocked. - * There is no need to serialise on the page lock here as the page - * is pinned so the lock gives no additional protection. Even if the - * page is truncated, the data is still valid if PageUptodate as - * it's a race vs truncate race. - * Case b, the page will not be up to date - * Case c, the page may be truncated but in itself, the data may still - * be valid after IO completes as it's a read vs truncate race. The - * operation must restart if the page is not uptodate on unlock but - * otherwise serialising on page lock to stabilise the mapping gives - * no additional guarantees to the caller as the page lock is - * released before return. - * Case d, similar to truncation. If reclaim holds the page lock, it - * will be a race with remove_mapping that determines if the mapping - * is valid on unlock but otherwise the data is valid and there is - * no need to serialise with page lock. 
- * - * As the page lock gives no additional guarantee, we optimistically - * wait on the page to be unlocked and check if it's up to date and - * use the page if it is. Otherwise, the page lock is required to - * distinguish between the different cases. The motivation is that we - * avoid spurious serialisations and wakeups when multiple processes - * wait on the same page for IO to complete. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) - goto out; - - /* Distinguish between all the cases under the safety of the lock */ - lock_page(page); + if (!folio_trylock(folio)) { + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* Case c or d, restart the operation */ - if (!page->mapping) { - unlock_page(page); - put_page(page); + /* Folio was truncated from mapping */ + if (!folio->mapping) { + folio_unlock(folio); + folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); goto out; } @@ -3533,16 +3557,16 @@ filler: * Clear page error before actual read, PG_error will be * set again if read page fails. */ - ClearPageError(page); + folio_clear_error(folio); goto filler; out: - mark_page_accessed(page); - return page; + folio_mark_accessed(folio); + return folio; } /** - * read_cache_page - read into page cache, fill it if needed + * read_cache_folio - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read @@ -3557,10 +3581,27 @@ out: * * Return: up to date page on success, ERR_PTR() on failure. */ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, + filler_t filler, void *data) +{ + return do_read_cache_folio(mapping, index, filler, data, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, void *data, gfp_t gfp) +{ + struct folio *folio; + + folio = do_read_cache_folio(mapping, index, filler, data, gfp); + if (IS_ERR(folio)) + return &folio->page; + return folio_file_page(folio, index); +} + struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) + pgoff_t index, filler_t *filler, void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -3920,33 +3961,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) EXPORT_SYMBOL(generic_file_write_iter); /** - * try_to_release_page() - release old fs-specific metadata on a page + * filemap_release_folio() - Release fs-specific metadata on a folio. + * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode). * - * @page: the page which the kernel is trying to free - * @gfp_mask: memory allocation flags (and I/O mode) + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private). * - * The address_space is to try to release any data against the page - * (presumably at page->private). + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it. * - * This may also be called if PG_fscache is set on a page, indicating that the - * page is known to the local caching routines. 
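For reference, a call site of the new read_cache_folio() might look roughly like the sketch below (hypothetical caller; with a NULL filler the mapping's ->readpage() is invoked and @data is passed to it as the struct file, as in do_read_cache_folio() above; consume_folio_data() is a placeholder):

        static int my_read_index(struct file *file, struct address_space *mapping,
                                 pgoff_t index)
        {
                struct folio *folio;

                folio = read_cache_folio(mapping, index, NULL, file);
                if (IS_ERR(folio))
                        return PTR_ERR(folio);

                /*
                 * On success the folio is uptodate, marked accessed and carries
                 * an extra reference; it is returned unlocked.
                 */
                consume_folio_data(folio);      /* hypothetical */
                folio_put(folio);
                return 0;
        }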
+ * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS). * - * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). - * - * Return: %1 if the release was successful, otherwise return zero. + * Return: %true if the release was successful, otherwise %false. */ -int try_to_release_page(struct page *page, gfp_t gfp_mask) +bool filemap_release_folio(struct folio *folio, gfp_t gfp) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(page, gfp_mask); - return try_to_free_buffers(page); + return mapping->a_ops->releasepage(&folio->page, gfp); + return try_to_free_buffers(&folio->page); } - -EXPORT_SYMBOL(try_to_release_page); +EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 5b6ae1da314e..46fa179e32fb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -7,6 +7,7 @@ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include "internal.h" struct address_space *page_mapping(struct page *page) { @@ -140,3 +141,26 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); + +void delete_from_page_cache(struct page *page) +{ + return filemap_remove_folio(page_folio(page)); +} + +int try_to_release_page(struct page *page, gfp_t gfp) +{ + return filemap_release_folio(page_folio(page), gfp); +} +EXPORT_SYMBOL(try_to_release_page); + +int isolate_lru_page(struct page *page) +{ + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) + return -EBUSY; + return folio_isolate_lru((struct folio *)page); +} + +void putback_lru_page(struct page *page) +{ + folio_putback_lru(page_folio(page)); +} diff --git a/mm/frontswap.c b/mm/frontswap.c index 130e301c5ac0..6f69b044a8cc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -27,27 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); * may be registered, but implementations can never deregister. This * is a simple singly-linked list of all registered implementations. */ -static struct frontswap_ops *frontswap_ops __read_mostly; - -#define for_each_frontswap_ops(ops) \ - for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) - -/* - * If enabled, frontswap_store will return failure even on success. As - * a result, the swap subsystem will always write the page to swap, in - * effect converting frontswap into a writethrough cache. In this mode, - * there is no direct reduction in swap writes, but a frontswap backend - * can unilaterally "reclaim" any pages in use with no data loss, thus - * providing increases control over maximum memory usage due to frontswap. - */ -static bool frontswap_writethrough_enabled __read_mostly; - -/* - * If enabled, the underlying tmem implementation is capable of doing - * exclusive gets, so frontswap_load, on a successful tmem_get must - * mark the page as no longer in frontswap AND mark it dirty. 
- */ -static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; +static const struct frontswap_ops *frontswap_ops __read_mostly; #ifdef CONFIG_DEBUG_FS /* @@ -114,87 +94,22 @@ static inline void inc_frontswap_invalidates(void) { } /* * Register operations for frontswap */ -void frontswap_register_ops(struct frontswap_ops *ops) +int frontswap_register_ops(const struct frontswap_ops *ops) { - DECLARE_BITMAP(a, MAX_SWAPFILES); - DECLARE_BITMAP(b, MAX_SWAPFILES); - struct swap_info_struct *si; - unsigned int i; - - bitmap_zero(a, MAX_SWAPFILES); - bitmap_zero(b, MAX_SWAPFILES); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (!WARN_ON(!si->frontswap_map)) - set_bit(si->type, a); - } - spin_unlock(&swap_lock); - - /* the new ops needs to know the currently active swap devices */ - for_each_set_bit(i, a, MAX_SWAPFILES) - ops->init(i); - - /* - * Setting frontswap_ops must happen after the ops->init() calls - * above; cmpxchg implies smp_mb() which will ensure the init is - * complete at this point. - */ - do { - ops->next = frontswap_ops; - } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + if (frontswap_ops) + return -EINVAL; + frontswap_ops = ops; static_branch_inc(&frontswap_enabled_key); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (si->frontswap_map) - set_bit(si->type, b); - } - spin_unlock(&swap_lock); - - /* - * On the very unlikely chance that a swap device was added or - * removed between setting the "a" list bits and the ops init - * calls, we re-check and do init or invalidate for any changed - * bits. - */ - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { - for (i = 0; i < MAX_SWAPFILES; i++) { - if (!test_bit(i, a) && test_bit(i, b)) - ops->init(i); - else if (test_bit(i, a) && !test_bit(i, b)) - ops->invalidate_area(i); - } - } -} -EXPORT_SYMBOL(frontswap_register_ops); - -/* - * Enable/disable frontswap writethrough (see above). - */ -void frontswap_writethrough(bool enable) -{ - frontswap_writethrough_enabled = enable; -} -EXPORT_SYMBOL(frontswap_writethrough); - -/* - * Enable/disable frontswap exclusive gets (see above). - */ -void frontswap_tmem_exclusive_gets(bool enable) -{ - frontswap_tmem_exclusive_gets_enabled = enable; + return 0; } -EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); /* * Called when a swap device is swapon'd. */ -void __frontswap_init(unsigned type, unsigned long *map) +void frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(sis == NULL); @@ -210,20 +125,16 @@ void __frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. 
*/ frontswap_map_set(sis, map); - - for_each_frontswap_ops(ops) - ops->init(type); + frontswap_ops->init(type); } -EXPORT_SYMBOL(__frontswap_init); -bool __frontswap_test(struct swap_info_struct *sis, +static bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { if (sis->frontswap_map) return test_bit(offset, sis->frontswap_map); return false; } -EXPORT_SYMBOL(__frontswap_test); static inline void __frontswap_set(struct swap_info_struct *sis, pgoff_t offset) @@ -253,7 +164,6 @@ int __frontswap_store(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -267,28 +177,19 @@ int __frontswap_store(struct page *page) */ if (__frontswap_test(sis, offset)) { __frontswap_clear(sis, offset); - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); } - /* Try to store in each implementation, until one succeeds. */ - for_each_frontswap_ops(ops) { - ret = ops->store(type, offset, page); - if (!ret) /* successful store */ - break; - } + ret = frontswap_ops->store(type, offset, page); if (ret == 0) { __frontswap_set(sis, offset); inc_frontswap_succ_stores(); } else { inc_frontswap_failed_stores(); } - if (frontswap_writethrough_enabled) - /* report failure so swap also writes to swap device */ - ret = -1; + return ret; } -EXPORT_SYMBOL(__frontswap_store); /* * "Get" data from frontswap associated with swaptype and offset that were @@ -302,7 +203,6 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -312,21 +212,11 @@ int __frontswap_load(struct page *page) return -1; /* Try loading from each implementation, until one succeeds. 
*/ - for_each_frontswap_ops(ops) { - ret = ops->load(type, offset, page); - if (!ret) /* successful load */ - break; - } - if (ret == 0) { + ret = frontswap_ops->load(type, offset, page); + if (ret == 0) inc_frontswap_loads(); - if (frontswap_tmem_exclusive_gets_enabled) { - SetPageDirty(page); - __frontswap_clear(sis, offset); - } - } return ret; } -EXPORT_SYMBOL(__frontswap_load); /* * Invalidate any data from frontswap associated with the specified swaptype @@ -335,7 +225,6 @@ EXPORT_SYMBOL(__frontswap_load); void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -343,12 +232,10 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) if (!__frontswap_test(sis, offset)) return; - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } -EXPORT_SYMBOL(__frontswap_invalidate_page); /* * Invalidate all data from frontswap associated with all offsets for the @@ -357,7 +244,6 @@ EXPORT_SYMBOL(__frontswap_invalidate_page); void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -365,123 +251,10 @@ void __frontswap_invalidate_area(unsigned type) if (sis->frontswap_map == NULL) return; - for_each_frontswap_ops(ops) - ops->invalidate_area(type); + frontswap_ops->invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); bitmap_zero(sis->frontswap_map, sis->max); } -EXPORT_SYMBOL(__frontswap_invalidate_area); - -static unsigned long __frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - struct swap_info_struct *si = NULL; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) - totalpages += atomic_read(&si->frontswap_pages); - return totalpages; -} - -static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, - int *swapid) -{ - int ret = -EINVAL; - struct swap_info_struct *si = NULL; - int si_frontswap_pages; - unsigned long total_pages_to_unuse = total; - unsigned long pages = 0, pages_to_unuse = 0; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - si_frontswap_pages = atomic_read(&si->frontswap_pages); - if (total_pages_to_unuse < si_frontswap_pages) { - pages = pages_to_unuse = total_pages_to_unuse; - } else { - pages = si_frontswap_pages; - pages_to_unuse = 0; /* unuse all */ - } - /* ensure there is enough RAM to fetch pages from frontswap */ - if (security_vm_enough_memory_mm(current->mm, pages)) { - ret = -ENOMEM; - continue; - } - vm_unacct_memory(pages); - *unused = pages_to_unuse; - *swapid = si->type; - ret = 0; - break; - } - - return ret; -} - -/* - * Used to check if it's necessary and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shrink pages, - * error code when there is an error. 
- */ -static int __frontswap_shrink(unsigned long target_pages, - unsigned long *pages_to_unuse, - int *type) -{ - unsigned long total_pages = 0, total_pages_to_unuse; - - assert_spin_locked(&swap_lock); - - total_pages = __frontswap_curr_pages(); - if (total_pages <= target_pages) { - /* Nothing to do */ - *pages_to_unuse = 0; - return 1; - } - total_pages_to_unuse = total_pages - target_pages; - return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); -} - -/* - * Frontswap, like a true swap device, may unnecessarily retain pages - * under certain circumstances; "shrink" frontswap is essentially a - * "partial swapoff" and works by calling try_to_unuse to attempt to - * unuse enough frontswap pages to attempt to -- subject to memory - * constraints -- reduce the number of pages in frontswap to the - * number given in the parameter target_pages. - */ -void frontswap_shrink(unsigned long target_pages) -{ - unsigned long pages_to_unuse = 0; - int type, ret; - - /* - * we don't want to hold swap_lock while doing a very - * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_active_head each time - */ - spin_lock(&swap_lock); - ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); - spin_unlock(&swap_lock); - if (ret == 0) - try_to_unuse(type, true, pages_to_unuse); - return; -} -EXPORT_SYMBOL(frontswap_shrink); - -/* - * Count and return the number of frontswap pages across all - * swap devices. This is exported so that backend drivers can - * determine current usage without reading debugfs. - */ -unsigned long frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - - spin_lock(&swap_lock); - totalpages = __frontswap_curr_pages(); - spin_unlock(&swap_lock); - - return totalpages; -} -EXPORT_SYMBOL(frontswap_curr_pages); static int __init init_frontswap(void) { @@ -29,107 +29,71 @@ struct follow_page_context { unsigned int page_mask; }; -static void hpage_pincount_add(struct page *page, int refs) -{ - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); - VM_BUG_ON_PAGE(page != compound_head(page), page); - - atomic_add(refs, compound_pincount_ptr(page)); -} - -static void hpage_pincount_sub(struct page *page, int refs) -{ - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); - VM_BUG_ON_PAGE(page != compound_head(page), page); - - atomic_sub(refs, compound_pincount_ptr(page)); -} - -/* Equivalent to calling put_page() @refs times. */ -static void put_page_refs(struct page *page, int refs) -{ -#ifdef CONFIG_DEBUG_VM - if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page)) - return; -#endif - - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). - */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); -} - /* - * Return the compound head page with ref appropriately incremented, + * Return the folio with ref appropriately incremented, * or NULL if that failed. 
*/ -static inline struct page *try_get_compound_head(struct page *page, int refs) +static inline struct folio *try_get_folio(struct page *page, int refs) { - struct page *head = compound_head(page); + struct folio *folio; - if (WARN_ON_ONCE(page_ref_count(head) < 0)) +retry: + folio = page_folio(page); + if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; - if (unlikely(!page_cache_add_speculative(head, refs))) + if (unlikely(!folio_ref_try_add_rcu(folio, refs))) return NULL; /* - * At this point we have a stable reference to the head page; but it - * could be that between the compound_head() lookup and the refcount - * increment, the compound page was split, in which case we'd end up - * holding a reference on a page that has nothing to do with the page + * At this point we have a stable reference to the folio; but it + * could be that between calling page_folio() and the refcount + * increment, the folio was split, in which case we'd end up + * holding a reference on a folio that has nothing to do with the page * we were given anymore. - * So now that the head page is stable, recheck that the pages still - * belong together. + * So now that the folio is stable, recheck that the page still + * belongs to this folio. */ - if (unlikely(compound_head(page) != head)) { - put_page_refs(head, refs); - return NULL; + if (unlikely(page_folio(page) != folio)) { + folio_put_refs(folio, refs); + goto retry; } - return head; + return folio; } /** - * try_grab_compound_head() - attempt to elevate a page's refcount, by a - * flags-dependent amount. - * - * Even though the name includes "compound_head", this function is still - * appropriate for callers that have a non-compound @page to get. - * + * try_grab_folio() - Attempt to get or pin a folio. * @page: pointer to page to be grabbed - * @refs: the value to (effectively) add to the page's refcount + * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * - * FOLL_GET: page's refcount will be incremented by @refs. + * FOLL_GET: folio's refcount will be incremented by @refs. * - * FOLL_PIN on compound pages that are > two pages long: page's refcount will - * be incremented by @refs, and page[2].hpage_pinned_refcount will be - * incremented by @refs * GUP_PIN_COUNTING_BIAS. + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its compound_pincount will be incremented by @refs. * - * FOLL_PIN on normal pages, or compound pages that are two pages long: - * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. + * FOLL_PIN on single-page folios: folio's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. * - * Return: head page (with refcount appropriately incremented) for success, or - * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's - * considered failure, and furthermore, a likely bug in the caller, so a warning - * is also emitted. + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. 
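These rules matter when auditing reference counts, so here is the expected folio_ref_count() movement spelled out as a tiny helper. It only restates the cases documented above; GUP_PIN_COUNTING_BIAS (1024 in include/linux/mm.h) is assumed, and the helper itself is illustrative, not part of the patch:

#include <linux/mm.h>

static unsigned int grab_ref_delta(struct folio *folio, int refs,
				   unsigned int flags)
{
	if (flags & FOLL_GET)
		return refs;
	if (flags & FOLL_PIN) {
		/* large folios additionally get compound_pincount += refs */
		if (folio_test_large(folio))
			return refs;
		return refs * GUP_PIN_COUNTING_BIAS;
	}
	return 0;	/* neither flag set: try_grab_folio() warns and fails */
}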
If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. */ -struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { if (flags & FOLL_GET) - return try_get_compound_head(page, refs); + return try_get_folio(page, refs); else if (flags & FOLL_PIN) { + struct folio *folio; + /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -143,63 +107,57 @@ struct page *try_grab_compound_head(struct page *page, * CAUTION: Don't use compound_head() on the page before this * point, the result won't be stable. */ - page = try_get_compound_head(page, refs); - if (!page) + folio = try_get_folio(page, refs); + if (!folio) return NULL; /* - * When pinning a compound page of order > 1 (which is what - * hpage_pincount_available() checks for), use an exact count to - * track it, via hpage_pincount_add/_sub(). + * When pinning a large folio, use an exact count to track it. * - * However, be sure to *also* increment the normal page refcount - * field at least once, so that the page really is pinned. - * That's why the refcount from the earlier - * try_get_compound_head() is left intact. + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. */ - if (hpage_pincount_available(page)) - hpage_pincount_add(page, refs); + if (folio_test_large(folio)) + atomic_add(refs, folio_pincount_ptr(folio)); else - page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, - refs); - - return page; + return folio; } WARN_ON_ONCE(1); return NULL; } -static void put_compound_head(struct page *page, int refs, unsigned int flags) +static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, - refs); - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, refs); + node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); + if (folio_test_large(folio)) + atomic_sub(refs, folio_pincount_ptr(folio)); else refs *= GUP_PIN_COUNTING_BIAS; } - put_page_refs(page, refs); + folio_put_refs(folio, refs); } /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount + * @page: pointer to page to be grabbed + * @flags: gup flags: these are the FOLL_* flag values. * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * - * @page: pointer to page to be grabbed - * @flags: gup flags: these are the FOLL_* flag values. - * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: please see the try_grab_compound_head() documentation, with + * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". 
* * Return: true for success, or if no action was required (if neither FOLL_PIN @@ -208,10 +166,31 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - if (!(flags & (FOLL_GET | FOLL_PIN))) - return true; + struct folio *folio = page_folio(page); + + WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) + return false; - return try_grab_compound_head(page, 1, flags); + if (flags & FOLL_GET) + folio_ref_inc(folio); + else if (flags & FOLL_PIN) { + /* + * Similar to try_grab_folio(): be sure to *also* + * increment the normal page refcount field at least once, + * so that the page really is pinned. + */ + if (folio_test_large(folio)) { + folio_ref_add(folio, 1); + atomic_add(1, folio_pincount_ptr(folio)); + } else { + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + } + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); + } + + return true; } /** @@ -225,62 +204,40 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) */ void unpin_user_page(struct page *page) { - put_compound_head(compound_head(page), 1, FOLL_PIN); + gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); -static inline void compound_range_next(unsigned long i, unsigned long npages, - struct page **list, struct page **head, - unsigned int *ntails) +static inline struct folio *gup_folio_range_next(struct page *start, + unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next, *page; + struct page *next = nth_page(start, i); + struct folio *folio = page_folio(next); unsigned int nr = 1; - if (i >= npages) - return; - - next = *list + i; - page = compound_head(next); - if (PageCompound(page) && compound_order(page) >= 1) - nr = min_t(unsigned int, - page + compound_nr(page) - next, npages - i); + if (folio_test_large(folio)) + nr = min_t(unsigned int, npages - i, + folio_nr_pages(folio) - folio_page_idx(folio, next)); - *head = page; *ntails = nr; + return folio; } -#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ - for (__i = 0, \ - compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ - __i < __npages; __i += __ntails, \ - compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) - -static inline void compound_next(unsigned long i, unsigned long npages, - struct page **list, struct page **head, - unsigned int *ntails) +static inline struct folio *gup_folio_next(struct page **list, + unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *page; + struct folio *folio = page_folio(list[i]); unsigned int nr; - if (i >= npages) - return; - - page = compound_head(list[i]); for (nr = i + 1; nr < npages; nr++) { - if (compound_head(list[nr]) != page) + if (page_folio(list[nr]) != folio) break; } - *head = page; *ntails = nr - i; + return folio; } -#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ - for (__i = 0, \ - compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ - __i < __npages; __i += __ntails, \ - compound_next(__i, __npages, __list, &(__head), &(__ntails))) - /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. 
@@ -306,16 +263,17 @@ static inline void compound_next(unsigned long i, unsigned long npages, void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { - unsigned long index; - struct page *head; - unsigned int ntails; + unsigned long i; + struct folio *folio; + unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } - for_each_compound_head(index, pages, npages, head, ntails) { + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key @@ -336,9 +294,12 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ - if (!PageDirty(head)) - set_page_dirty_lock(head); - put_compound_head(head, ntails, FOLL_PIN); + if (!folio_test_dirty(folio)) { + folio_lock(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + } + gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); @@ -367,14 +328,18 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { - unsigned long index; - struct page *head; - unsigned int ntails; + unsigned long i; + struct folio *folio; + unsigned int nr; - for_each_compound_range(index, &page, npages, head, ntails) { - if (make_dirty && !PageDirty(head)) - set_page_dirty_lock(head); - put_compound_head(head, ntails, FOLL_PIN); + for (i = 0; i < npages; i += nr) { + folio = gup_folio_range_next(page, npages, i, &nr); + if (make_dirty && !folio_test_dirty(folio)) { + folio_lock(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + } + gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); @@ -390,9 +355,9 @@ EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); */ void unpin_user_pages(struct page **pages, unsigned long npages) { - unsigned long index; - struct page *head; - unsigned int ntails; + unsigned long i; + struct folio *folio; + unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by @@ -402,8 +367,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages) if (WARN_ON(IS_ERR_VALUE(npages))) return; - for_each_compound_head(index, pages, npages, head, ntails) - put_compound_head(head, ntails, FOLL_PIN); + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); + gup_put_folio(folio, nr, FOLL_PIN); + } } EXPORT_SYMBOL(unpin_user_pages); @@ -439,10 +406,6 @@ static struct page *no_page_table(struct vm_area_struct *vma, static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { - /* No page to get reference */ - if (flags & FOLL_GET) - return -EFAULT; - if (flags & FOLL_TOUCH) { pte_t entry = *pte; @@ -572,32 +535,6 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* Do not mlock pte-mapped THP */ - if (PageTransCompound(page)) - goto out; - - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. 
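The unpin loops above coalesce consecutive array entries that belong to the same folio, so each folio's reference count is touched once per run rather than once per page. The same idea as a free-standing helper, counting how many distinct folios back a pinned pages[] array (a sketch, not part of the patch):

static unsigned long count_pinned_folios(struct page **pages,
					 unsigned long npages)
{
	struct folio *prev = NULL;
	unsigned long i, nr_folios = 0;

	for (i = 0; i < npages; i++) {
		struct folio *folio = page_folio(pages[i]);

		if (folio != prev) {
			nr_folios++;
			prev = folio;
		}
	}
	return nr_folios;
}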
- */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } out: pte_unmap_unlock(ptep, ptl); return page; @@ -642,12 +579,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } retry: if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. + */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmdval)); - if (is_pmd_migration_entry(pmdval)) - pmd_migration_entry_wait(mm, pmd); + + pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because @@ -915,9 +857,6 @@ static int faultin_page(struct vm_area_struct *vma, unsigned int fault_flags = 0; vm_fault_t ret; - /* mlock all present pages, but do not fault in new pages */ - if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) - return -ENOENT; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) @@ -1168,15 +1107,20 @@ retry: case -ENOMEM: case -EHWPOISON: goto out; - case -ENOENT: - goto next_page; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding - * struct page. + * struct page. If the caller expects **pages to be + * filled in, bail out now, because that can't be done + * for this page. */ + if (pages) { + ret = PTR_ERR(page); + goto out; + } + goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); @@ -1467,9 +1411,14 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + /* + * Rightly or wrongly, the VM_LOCKONFAULT case has never used + * faultin_page() to break COW, so it has no work to do here. + */ if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; + return nr_pages; + + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1536,10 +1485,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. - * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. * !FOLL_FORCE: Require proper access permissions. 
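populate_vma_page_range() can now return immediately for VM_LOCKONFAULT because such mappings are, by definition, only populated when the pages are first touched. From userspace that corresponds to mlock2() with MLOCK_ONFAULT; a minimal illustration (not part of the patch):

#include <sys/mman.h>

/* Pages are locked into RAM only after they are first touched. */
static int lock_on_fault(void *buf, size_t len)
{
	return mlock2(buf, len, MLOCK_ONFAULT);
}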
*/ - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; if (write) gup_flags |= FOLL_WRITE; @@ -1672,21 +1620,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_write_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__put_user(0, uaddr) != 0)) - return size; + unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__put_user(0, uaddr) != 0)) - goto out; + unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: + user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; @@ -1698,11 +1647,11 @@ EXPORT_SYMBOL(fault_in_writeable); * @uaddr: start of address range * @size: length of address range * - * Faults in an address range using get_user_pages, i.e., without triggering - * hardware page faults. This is primarily useful when we already know that - * some or all of the pages in the address range aren't in memory. + * Faults in an address range for writing. This is primarily useful when we + * already know that some or all of the pages in the address range aren't in + * memory. * - * Other than fault_in_writeable(), this function is non-destructive. + * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of @@ -1713,46 +1662,27 @@ EXPORT_SYMBOL(fault_in_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)untagged_addr(uaddr); - unsigned long end, nstart, nend; + unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - int locked = 0; + bool unlocked = false; - nstart = start & PAGE_MASK; + if (unlikely(size == 0)) + return 0; end = PAGE_ALIGN(start + size); - if (end < nstart) + if (end < start) end = 0; - for (; nstart != end; nstart = nend) { - unsigned long nr_pages; - long ret; - if (!locked) { - locked = 1; - mmap_read_lock(mm); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - nend = end ? 
min(end, vma->vm_end) : vma->vm_end; - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - nr_pages = (nend - nstart) / PAGE_SIZE; - ret = __get_user_pages_locked(mm, nstart, nr_pages, - NULL, NULL, &locked, - FOLL_TOUCH | FOLL_WRITE); - if (ret <= 0) + mmap_read_lock(mm); + do { + if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) break; - nend = nstart + ret * PAGE_SIZE; - } - if (locked) - mmap_read_unlock(mm); - if (nstart == end) - return 0; - return size - min_t(size_t, nstart - start, size); + start = (start + PAGE_SIZE) & PAGE_MASK; + } while (start != end); + mmap_read_unlock(mm); + + if (size > (unsigned long)uaddr - start) + return size - ((unsigned long)uaddr - start); + return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -1771,21 +1701,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_read_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__get_user(c, uaddr) != 0)) - return size; + unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__get_user(c, uaddr) != 0)) - goto out; + unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: + user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); @@ -1836,72 +1767,80 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - unsigned long i; - unsigned long isolation_error_count = 0; - bool drain_allow = true; + unsigned long isolation_error_count = 0, i; + struct folio *prev_folio = NULL; LIST_HEAD(movable_page_list); - long ret = 0; - struct page *prev_head = NULL; - struct page *head; - struct migration_target_control mtc = { - .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_NOWARN, - }; + bool drain_allow = true; + int ret = 0; for (i = 0; i < nr_pages; i++) { - head = compound_head(pages[i]); - if (head == prev_head) + struct folio *folio = page_folio(pages[i]); + + if (folio == prev_folio) continue; - prev_head = head; + prev_folio = folio; + + if (folio_is_pinnable(folio)) + continue; + /* - * If we get a movable page, since we are going to be pinning - * these entries, try to move them out if possible. + * Try to move out any movable page before pinning the range. 
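The rewritten helpers keep the existing contract: the return value is the number of bytes that could not be faulted in, with zero meaning the whole range is accessible. A sketch of the usual caller pattern, copying with page faults disabled and faulting the destination in on failure; the function name is illustrative and nothing beyond the helpers shown above is assumed:

#include <linux/pagemap.h>
#include <linux/uaccess.h>

static ssize_t copy_out_with_faultin(char __user *dst, const void *src,
				     size_t len)
{
	size_t not_copied;

	do {
		pagefault_disable();
		not_copied = copy_to_user(dst, src, len);
		pagefault_enable();
		if (!not_copied)
			return len;
		/* retry only if the whole destination could be faulted in */
	} while (!fault_in_writeable(dst, len));

	return -EFAULT;
}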
*/ - if (!is_pinnable_page(head)) { - if (PageHuge(head)) { - if (!isolate_huge_page(head, &movable_page_list)) - isolation_error_count++; - } else { - if (!PageLRU(head) && drain_allow) { - lru_add_drain_all(); - drain_allow = false; - } + if (folio_test_hugetlb(folio)) { + if (!isolate_huge_page(&folio->page, + &movable_page_list)) + isolation_error_count++; + continue; + } - if (isolate_lru_page(head)) { - isolation_error_count++; - continue; - } - list_add_tail(&head->lru, &movable_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); - } + if (!folio_test_lru(folio) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (folio_isolate_lru(folio)) { + isolation_error_count++; + continue; } + list_add_tail(&folio->lru, &movable_page_list); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } + if (!list_empty(&movable_page_list) || isolation_error_count) + goto unpin_pages; + /* * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ - if (list_empty(&movable_page_list) && !isolation_error_count) - return nr_pages; + return nr_pages; +unpin_pages: if (gup_flags & FOLL_PIN) { unpin_user_pages(pages, nr_pages); } else { for (i = 0; i < nr_pages; i++) put_page(pages[i]); } + if (!list_empty(&movable_page_list)) { + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_NOWARN, + }; + ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL); - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + if (ret > 0) /* number of pages not migrated */ + ret = -ENOMEM; } - return ret > 0 ? -ENOMEM : ret; + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); + return ret; } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, @@ -2110,65 +2049,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * get_user_pages_locked() - variant of get_user_pages() - * - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * - * It is suitable to replace the form: - * - * mmap_read_lock(mm); - * do_something() - * get_user_pages(mm, ..., pages, NULL); - * mmap_read_unlock(mm); - * - * to: - * - * int locked = 1; - * mmap_read_lock(mm); - * do_something() - * get_user_pages_locked(mm, ..., pages, &locked); - * if (locked) - * mmap_read_unlock(mm); - * - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. 
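This migration pass only runs for FOLL_LONGTERM pins, where leaving pages in ZONE_MOVABLE or CMA would block hot-unplug and contiguous allocations for an unbounded time. A sketch of the caller side that reaches it, assuming the pin_user_pages() signature current at the time of this diff (with a trailing vmas argument); the helper name is illustrative:

static long pin_user_buffer(unsigned long uaddr, unsigned long nr_pages,
			    struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr_pages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	mmap_read_unlock(current->mm);

	/* drop with unpin_user_pages(pages, pinned) when the DMA is done */
	return pinned;
}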
As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) - return -EINVAL; - - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - /* * get_user_pages_unlocked() is suitable to replace the form: * @@ -2270,7 +2150,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ptem = ptep = pte_offset_map(&pmd, addr); do { pte_t pte = ptep_get_lockless(ptep); - struct page *head, *page; + struct page *page; + struct folio *folio; /* * Similar to the PMD case below, NUMA hinting must take slow @@ -2297,22 +2178,20 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = try_grab_compound_head(page, 1, flags); - if (!head) + folio = try_grab_folio(page, 1, flags); + if (!folio) goto pte_unmap; if (unlikely(page_is_secretmem(page))) { - put_compound_head(head, 1, flags); + gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_compound_head(head, 1, flags); + gup_put_folio(folio, 1, flags); goto pte_unmap; } - VM_BUG_ON_PAGE(compound_head(page) != head, page); - /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please @@ -2322,14 +2201,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { - unpin_user_page(page); + gup_put_folio(folio, 1, flags); goto pte_unmap; } } - SetPageReferenced(page); + folio_set_referenced(folio); pages[*nr] = page; (*nr)++; - } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; @@ -2446,8 +2324,8 @@ static int record_subpages(struct page *page, unsigned long addr, { int nr; - for (nr = 0; addr != end; addr += PAGE_SIZE) - pages[nr++] = page++; + for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) + pages[nr] = nth_page(page, nr); return nr; } @@ -2465,7 +2343,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, struct page **pages, int *nr) { unsigned long pte_end; - struct page *head, *page; + struct page *page; + struct folio *folio; pte_t pte; int refs; @@ -2481,21 +2360,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - head = pte_page(pte); - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_grab_compound_head(head, refs, flags); - if (!head) + folio = try_grab_folio(page, refs, flags); + if (!folio) return 0; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_compound_head(head, refs, flags); + gup_put_folio(folio, refs, flags); return 0; } *nr += refs; - SetPageReferenced(head); + folio_set_referenced(folio); return 1; } @@ -2529,7 +2407,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { - struct page *head, *page; + struct page *page; + struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & 
FOLL_WRITE)) @@ -2542,20 +2421,20 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, pages, nr); } - page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_grab_compound_head(pmd_page(orig), refs, flags); - if (!head) + folio = try_grab_folio(page, refs, flags); + if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - put_compound_head(head, refs, flags); + gup_put_folio(folio, refs, flags); return 0; } *nr += refs; - SetPageReferenced(head); + folio_set_referenced(folio); return 1; } @@ -2563,7 +2442,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { - struct page *head, *page; + struct page *page; + struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) @@ -2576,20 +2456,20 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, pages, nr); } - page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_grab_compound_head(pud_page(orig), refs, flags); - if (!head) + folio = try_grab_folio(page, refs, flags); + if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { - put_compound_head(head, refs, flags); + gup_put_folio(folio, refs, flags); return 0; } *nr += refs; - SetPageReferenced(head); + folio_set_referenced(folio); return 1; } @@ -2598,27 +2478,28 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page; + struct page *page; + struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); - page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - head = try_grab_compound_head(pgd_page(orig), refs, flags); - if (!head) + folio = try_grab_folio(page, refs, flags); + if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - put_compound_head(head, refs, flags); + gup_put_folio(folio, refs, flags); return 0; } *nr += refs; - SetPageReferenced(head); + folio_set_referenced(folio); return 1; } @@ -3111,32 +2992,3 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); - -/* - * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). - * Behavior is the same, except that this one sets FOLL_PIN and rejects - * FOLL_GET. - */ -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - /* FOLL_GET and FOLL_PIN are mutually exclusive. 
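record_subpages() and the gup_huge_*() helpers above now step to tail pages with nth_page() instead of raw pointer arithmetic: with SPARSEMEM and no vmemmap, the struct pages backing one huge page are not guaranteed to be virtually contiguous across section boundaries. The safe idiom, as a sketch (helper name illustrative):

static void fill_tail_pages(struct page *head, unsigned int nr,
			    struct page **pages)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		pages[i] = nth_page(head, i);	/* not "head + i" */
}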
*/ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - gup_flags |= FOLL_PIN; - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(pin_user_pages_locked); diff --git a/mm/highmem.c b/mm/highmem.c index 762679050c9a..0cc0c4da7ed9 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -736,11 +736,11 @@ void *page_address(const struct page *page) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - goto done; + break; } } spin_unlock_irqrestore(&pas->lock, flags); } -done: + return; } @@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !pte_devmap(pte) && + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -416,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned long addr = start; pud_t pud; - int ret = 0; spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); if (!ptl) @@ -465,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, out_unlock: spin_unlock(ptl); - return ret; + return 0; } #else #define hmm_vma_walk_pud NULL @@ -518,7 +518,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && vma->vm_flags & VM_READ) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5483347291c..005fab2f3b73 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -34,6 +34,7 @@ #include <linux/oom.h> #include <linux/numa.h> #include <linux/page_owner.h> +#include <linux/sched/sysctl.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -582,13 +583,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) - goto out; - ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); if (ret) return ret; -out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); @@ -1322,7 +1320,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1380,39 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * We don't mlock() pte-mapped THPs. This way we can avoid - * leaking mlocked pages into non-VM_LOCKED VMAs. 
- * - * For anon THP: - * - * In most cases the pmd is the only mapping of the page as we - * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for - * writable private mappings in populate_vma_page_range(). - * - * The only scenario when we have the page shared here is if we - * mlocking read-only mapping shared over fork(). We skip - * mlocking such pages. - * - * For file THP: - * - * We can expect PageDoubleMap() to be stable under page lock: - * for file pages we set it in page_add_file_rmap(), which - * requires page to be locked. - */ - - if (PageAnon(page) && compound_mapcount(page) != 1) - goto skip_mlock; - if (PageDoubleMap(page) || !page->mapping) - goto skip_mlock; - if (!trylock_page(page)) - goto skip_mlock; - if (page->mapping && !PageDoubleMap(page)) - mlock_vma_page(page); - unlock_page(page); - } -skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); @@ -1610,7 +1575,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1766,17 +1731,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } #endif - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) - goto unlock; + if (prot_numa) { + struct page *page; + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (is_huge_zero_pmd(*pmd)) + goto unlock; - if (prot_numa && pmd_protnone(*pmd)) - goto unlock; + if (pmd_protnone(*pmd)) + goto unlock; + page = pmd_page(*pmd); + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(page_to_nid(page))) + goto unlock; + } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED @@ -1995,7 +1971,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2055,9 +2031,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); uffd_wp = pmd_uffd_wp(old_pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + page_ref_add(page, HPAGE_PMD_NR - 1); } - VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); /* * Withdraw the table only after we mark the pmd entry invalid. 
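With only the memory-tiering mode enabled, hinting faults are useful solely for pages that might be promoted, so pages already on a top-tier node are skipped; enabling normal NUMA balancing as well restores the old behaviour. The gate above, pulled out for readability (a sketch that restates the check rather than adding anything):

static bool skip_numa_hinting_fault(struct page *page)
{
	return !(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
	       node_is_toptier(page_to_nid(page));
}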
@@ -2129,6 +2105,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2136,18 +2115,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct page *page) + unsigned long address, bool freeze, struct folio *folio) { spinlock_t *ptl; struct mmu_notifier_range range; - bool do_unlock_page = false; + bool do_unlock_folio = false; pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -2157,20 +2136,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, ptl = pmd_lock(vma->vm_mm, pmd); /* - * If caller asks to setup a migration entries, we need a page to check - * pmd against. Otherwise we can end up replacing wrong page. + * If caller asks to setup a migration entry, we need a folio to check + * pmd against. Otherwise we can end up replacing wrong folio. */ - VM_BUG_ON(freeze && !page); - if (page) { - VM_WARN_ON_ONCE(!PageLocked(page)); - if (page != pmd_page(*pmd)) + VM_BUG_ON(freeze && !folio); + if (folio) { + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + if (folio != page_folio(pmd_page(*pmd))) goto out; } repeat: if (pmd_trans_huge(*pmd)) { - if (!page) { - page = pmd_page(*pmd); + if (!folio) { + folio = page_folio(pmd_page(*pmd)); /* * An anonymous page must be locked, to ensure that a * concurrent reuse_swap_page() sees stable mapcount; @@ -2178,33 +2157,31 @@ repeat: * and page lock must not be taken when zap_pmd_range() * calls __split_huge_pmd() while i_mmap_lock is held. */ - if (PageAnon(page)) { - if (unlikely(!trylock_page(page))) { - get_page(page); + if (folio_test_anon(folio)) { + if (unlikely(!folio_trylock(folio))) { + folio_get(folio); _pmd = *pmd; spin_unlock(ptl); - lock_page(page); + folio_lock(folio); spin_lock(ptl); if (unlikely(!pmd_same(*pmd, _pmd))) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; goto repeat; } - put_page(page); + folio_put(folio); } - do_unlock_page = true; + do_unlock_folio = true; } } - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); - if (do_unlock_page) - unlock_page(page); + if (do_unlock_folio) + folio_unlock(folio); /* * No need to double call mmu_notifier->invalidate_range() callback. 
* They are 3 cases to consider inside __split_huge_pmd_locked(): @@ -2222,7 +2199,7 @@ out: } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct page *page) + bool freeze, struct folio *folio) { pgd_t *pgd; p4d_t *p4d; @@ -2243,7 +2220,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, page); + __split_huge_pmd(vma, pmd, address, freeze, folio); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) @@ -2283,6 +2260,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { + struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; @@ -2293,26 +2271,27 @@ static void unmap_page(struct page *page) * pages can simply be left unmapped, then faulted back on demand. * If that is ever changed (perhaps for mlock), update remap_page(). */ - if (PageAnon(page)) - try_to_migrate(page, ttu_flags); + if (folio_test_anon(folio)) + try_to_migrate(folio, ttu_flags); else - try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK); + try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } -static void remap_page(struct page *page, unsigned int nr) +static void remap_page(struct folio *folio, unsigned long nr) { - int i; + int i = 0; /* If unmap_page() uses try_to_migrate() on file, remove this check */ - if (!PageAnon(page)) + if (!folio_test_anon(folio)) return; - if (PageTransHuge(page)) { - remove_migration_ptes(page, page, true); - } else { - for (i = 0; i < nr; i++) - remove_migration_ptes(page + i, page + i, true); + for (;;) { + remove_migration_ptes(folio, folio, true); + i += folio_nr_pages(folio); + if (i >= nr) + break; + folio = folio_next(folio); } } @@ -2332,8 +2311,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); + if (PageUnevictable(tail)) + tail->mlock_count = 0; + else + list_add_tail(&tail->lru, &head->lru); SetPageLRU(tail); - list_add_tail(&tail->lru, &head->lru); } } @@ -2469,7 +2451,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); - remap_page(head, nr); + remap_page(folio, nr); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2494,30 +2476,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } -int total_mapcount(struct page *page) -{ - int i, compound, nr, ret; - - VM_BUG_ON_PAGE(PageTail(page), page); - - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - compound = compound_mapcount(page); - nr = compound_nr(page); - if (PageHuge(page)) - return compound; - ret = compound; - for (i = 0; i < nr; i++) - ret += atomic_read(&page[i]._mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!PageAnon(page)) - return ret - compound * nr; - if (PageDoubleMap(page)) - ret -= nr; - return ret; -} - /* * This calculates accurately how many mappings a transparent hugepage * has (unlike page_mapcount() which isn't fully accurate). This full @@ -2542,53 +2500,44 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). 
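remap_page() above no longer assumes a split leaves order-0 pages behind; it simply walks whatever folios now back the range. The same walk as a generic helper (illustrative only, not part of the patch):

static void for_each_folio_in_range(struct folio *folio, unsigned long nr,
				    void (*fn)(struct folio *))
{
	unsigned long handled = 0;

	for (;;) {
		fn(folio);
		handled += folio_nr_pages(folio);
		if (handled >= nr)
			break;
		folio = folio_next(folio);
	}
}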
*/ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ -bool can_split_huge_page(struct page *page, int *pextra_pins) +bool can_split_folio(struct folio *folio, int *pextra_pins) { int extra_pins; /* Additional pins from page cache */ - if (PageAnon(page)) - extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0; + if (folio_test_anon(folio)) + extra_pins = folio_test_swapcache(folio) ? + folio_nr_pages(folio) : 0; else - extra_pins = thp_nr_pages(page); + extra_pins = folio_nr_pages(folio); if (pextra_pins) *pextra_pins = extra_pins; - return total_mapcount(page) == page_count(page) - extra_pins - 1; + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; } /* @@ -2612,8 +2561,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) */ int split_huge_page_to_list(struct page *page, struct list_head *list) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct deferred_split *ds_queue = get_deferred_split_queue(head); + XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2631,7 +2582,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This - * is similar to page_lock_anon_vma_read except the write lock + * is similar to folio_lock_anon_vma_read except the write lock * is taken to serialise against parallel split or collapse * operations. 
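can_split_folio() above is a reference budget: a splittable folio may hold one reference per mapping, one per page-cache or swap-cache slot, plus the caller's own reference, and nothing else. The same check written as a predicate (a sketch restating the logic, not new behaviour):

static bool folio_has_unexpected_refs(struct folio *folio)
{
	int cache_refs;

	if (folio_test_anon(folio))
		cache_refs = folio_test_swapcache(folio) ?
					folio_nr_pages(folio) : 0;
	else
		cache_refs = folio_nr_pages(folio);

	return folio_ref_count(folio) !=
	       folio_mapcount(folio) + cache_refs + 1;
}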
*/ @@ -2652,6 +2603,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } + xas_split_alloc(&xas, head, compound_order(head), + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + anon_vma = NULL; i_mmap_lock_read(mapping); @@ -2671,7 +2629,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before unmap_page() will * split PMDs */ - if (!can_split_huge_page(head, &extra_pins)) { + if (!can_split_folio(folio, &extra_pins)) { ret = -EBUSY; goto out_unlock; } @@ -2681,13 +2639,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { - XA_STATE(xas, &mapping->i_pages, page_index(head)); - /* * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - xa_lock(&mapping->i_pages); + xas_lock(&xas); + xas_reset(&xas); if (xas_load(&xas) != head) goto fail; } @@ -2703,6 +2660,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { int nr = thp_nr_pages(head); + xas_split(&xas, head, thp_order(head)); if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); @@ -2719,9 +2677,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); local_irq_enable(); - remap_page(head, thp_nr_pages(head)); + remap_page(folio, folio_nr_pages(folio)); ret = -EBUSY; } @@ -2733,6 +2691,8 @@ out_unlock: if (mapping) i_mmap_unlock_read(mapping); out: + /* Free any memory we didn't use */ + xas_nomem(&xas, 0); count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } @@ -2953,7 +2913,6 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = find_vma(mm, addr); - unsigned int follflags; struct page *page; if (!vma || addr < vma->vm_start) @@ -2966,8 +2925,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, } /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - page = follow_page(vma, addr, follflags); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (IS_ERR(page)) continue; @@ -2978,7 +2936,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto next; total++; - if (!can_split_huge_page(compound_head(page), NULL)) + if (!can_split_folio(page_folio(page), NULL)) goto next; if (!trylock_page(page)) @@ -3171,7 +3129,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } @@ -3197,14 +3155,13 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); - flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); + + /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1baa198519a..b34f50156f7e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,6 +31,7 @@ #include <linux/llist.h> #include <linux/cma.h> #include <linux/migrate.h> +#include <linux/nospec.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -1320,7 +1321,9 @@ static void __destroy_compound_gigantic_page(struct page *page, } set_compound_order(page, 0); +#ifdef CONFIG_64BIT page[1].compound_nr = 0; +#endif __ClearPageHead(page); } @@ -1812,7 +1815,9 @@ out_error: for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) __ClearPageReserved(p); set_compound_order(page, 0); +#ifdef CONFIG_64BIT page[1].compound_nr = 0; +#endif __ClearPageHead(page); return false; } @@ -1854,6 +1859,7 @@ int PageHeadHuge(struct page *page_head) return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; } +EXPORT_SYMBOL_GPL(PageHeadHuge); /* * Find and lock address space (mapping) in write mode. 
@@ -3498,8 +3504,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) static struct kobj_attribute _name##_attr = __ATTR_WO(_name) #define HSTATE_ATTR(_name) \ - static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; @@ -4159,10 +4164,10 @@ static int __init hugepages_setup(char *s) pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 0; } - node = tmp; - p += count + 1; - if (node < 0 || node >= nr_online_nodes) + if (tmp >= nr_online_nodes) goto invalid; + node = array_index_nospec(tmp, nr_online_nodes); + p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; @@ -4637,7 +4642,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; @@ -4684,8 +4688,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr struct page *new_page) { __SetPageUptodate(new_page); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); @@ -4851,14 +4855,13 @@ again: } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pte_t *src_pte) + unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; - pte_t *dst_pte, pte; spinlock_t *src_ptl, *dst_ptl; + pte_t pte; - dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h)); dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); @@ -4917,7 +4920,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (!dst_pte) break; - move_huge_pte(vma, old_addr, new_addr, src_pte); + move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); } flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); @@ -5014,7 +5017,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5259,10 +5262,10 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); + page_remove_rmap(old_page, vma, true); + hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; @@ -5342,6 +5345,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, pgoff_t idx, unsigned int flags, unsigned long haddr, + unsigned long addr, unsigned long reason) { vm_fault_t ret; @@ -5349,6 +5353,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct vm_fault vmf 
= { .vma = vma, .address = haddr, + .real_address = addr, .flags = flags, /* @@ -5417,7 +5422,7 @@ retry: /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MISSING); goto out; } @@ -5481,7 +5486,7 @@ retry: unlock_page(page); put_page(page); ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MINOR); goto out; } @@ -5818,7 +5823,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; goto out; } - folio_copy(page_folio(page), page_folio(*pagep)); + copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } @@ -6072,7 +6078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pages) { /* - * try_grab_compound_head() should always succeed here, + * try_grab_folio() should always succeed here, * because: a) we hold the ptl lock, and b) we've just * checked that the huge page is present in the page * tables. If the huge page is present, then the tail @@ -6081,9 +6087,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * any way. So this page must be available at this * point, unless the page refcount overflowed: */ - if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], - refs, - flags))) { + if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, + flags))) { spin_unlock(ptl); remainder = 0; err = -ENOMEM; @@ -6172,7 +6177,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); - pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); + pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; @@ -6890,9 +6895,9 @@ static int __init cmdline_parse_hugetlb_cma(char *p) break; if (s[count] == ':') { - nid = tmp; - if (nid < 0 || nid >= MAX_NUMNODES) + if (tmp >= MAX_NUMNODES) break; + nid = array_index_nospec(tmp, MAX_NUMNODES); s += count + 1; tmp = memparse(s, &s); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 79d93534ef1e..f9942841df18 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } } +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to -1 for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? 
node : -1; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cgroup; - - h_cgroup = hugetlb_cgroup_from_css(css); - kfree(h_cgroup); + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* @@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, return; __set_hugetlb_cgroup(page, h_cg, rsvd); - return; + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, @@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (rsvd) css_put(&h_cg->css); - - return; + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, @@ -418,6 +466,59 @@ enum { RES_RSVD_FAILCNT, }; +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, transverse the css tree to obtain the hierarchical + * node usage. 
+ */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5..791626983c2e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. 
* @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. 
*/ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -196,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -277,14 +284,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index aff4d27ec235..bb0cea5468cb 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -32,9 +32,9 @@ static int hwpoison_inject(void *data, u64 val) shake_page(hpage); /* - * This implies unable to support non-LRU pages. + * This implies unable to support non-LRU pages except free page. */ - if (!PageLRU(hpage) && !PageHuge(p)) + if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) return 0; /* @@ -48,7 +48,8 @@ static int hwpoison_inject(void *data, u64 val) inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); - return memory_failure(pfn, 0); + err = memory_failure(pfn, 0); + return (err == -EOPNOTSUPP) ? 
0 : err; } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/init-mm.c b/mm/init-mm.c index b4a6f38fb51d..fbe7844d0912 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/user_namespace.h> +#include <linux/ioasid.h> #include <asm/mmu.h> #ifndef INIT_MM_CONTEXT @@ -38,6 +39,9 @@ struct mm_struct init_mm = { .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_IOMMU_SVA + .pasid = INVALID_IOASID, +#endif INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 3b79a5c9427a..58dc6adc19c5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -10,8 +10,11 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/rmap.h> #include <linux/tracepoint-defs.h> +struct folio_batch; + /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints @@ -21,7 +24,7 @@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -64,23 +67,20 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); +void deactivate_file_folio(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); -static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); -} - +struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); -void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, - unsigned long lookahead_size); +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, + unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) @@ -90,7 +90,16 @@ static inline void force_page_cache_readahead(struct address_space *mapping, } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end); +long invalidate_inode_page(struct page *page); +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); /** * folio_evictable - Test whether a folio is evictable. 
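The hwpoison-inject change above stops treating memory_failure()'s -EOPNOTSUPP as an injector error and also lets free buddy pages through the early filter. For context, a sketch of how the injector is normally driven from userspace, assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with the hwpoison injector:

  /* Write a page frame number to the corrupt-pfn file exported by
   * mm/hwpoison-inject.c; requires root and a mounted debugfs. */
  #include <stdio.h>
  #include <stdlib.h>

  int main(int argc, char **argv)
  {
          const char *path = "/sys/kernel/debug/hwpoison/corrupt-pfn";
          FILE *f;

          if (argc < 2) {
                  fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
                  return 1;
          }
          f = fopen(path, "w");
          if (!f) {
                  perror(path);
                  return 1;
          }
          fprintf(f, "%lu\n", strtoul(argv[1], NULL, 0));
          return fclose(f) ? 1 : 0;
  }

The pfn is typically taken from /proc/<pid>/pagemap; the write ends up in hwpoison_inject() for that frame.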
@@ -146,10 +155,18 @@ extern unsigned long highest_memmap_pfn; #define MAX_RECLAIM_RETRIES 16 /* + * in mm/early_ioremap.c + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, pgprot_t prot); + +/* * in mm/vmscan.c: */ -extern int isolate_lru_page(struct page *page); -extern void putback_lru_page(struct page *page); +int isolate_lru_page(struct page *page); +int folio_isolate_lru(struct folio *folio); +void putback_lru_page(struct page *page); +void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* @@ -158,11 +175,6 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* - * in mm/memcontrol.c: - */ -extern bool cgroup_memory_nokmem; - -/* * in mm/page_alloc.c */ @@ -386,39 +398,65 @@ static inline bool is_data_mapping(vm_flags_t flags) void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev); void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); +struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -static inline void munlock_vma_pages_all(struct vm_area_struct *vma) -{ - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); -} - -/* - * must be called with vma's mmap_lock held for read or write, and page locked. - */ -extern void mlock_vma_page(struct page *page); -extern unsigned int munlock_vma_page(struct page *page); - extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * Clear the page's PageMlocked(). This can be useful in a situation where - * we want to unconditionally remove a page from the pagecache -- e.g., - * on truncation or freeing. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). * - * It is legal to call this function for any page, mlocked or not. - * If called for a page that is still mapped by mlocked vmas, all we do - * is revert to lazy LRU behaviour -- semantics are not broken. + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. */ -extern void clear_page_mlock(struct page *page); +void mlock_folio(struct folio *folio); +static inline void mlock_vma_folio(struct folio *folio, + struct vm_area_struct *vma, bool compound) +{ + /* + * The VM_SPECIAL check here serves two purposes. + * 1) VM_IO check prevents migration from double-counting during mlock. 
+ * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED + * is never left set on a VM_SPECIAL vma, there is an interval while + * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may + * still be set while VM_SPECIAL bits are added: so ignore it then. + */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && + (compound || !folio_test_large(folio))) + mlock_folio(folio); +} + +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + mlock_vma_folio(page_folio(page), vma, compound); +} + +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -452,18 +490,20 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * Then at what user virtual address will none of the page be found in vma? + * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. - * If page is a compound head, the entire compound page is considered. */ -static inline unsigned long -vma_address_end(struct page *page, struct vm_area_struct *vma) +static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { + struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - pgoff = page_to_pgoff(page) + compound_nr(page); + /* Common case, plus ->pgoff is invalid for KSM */ + if (pvmw->nr_pages == 1) + return pvmw->address + PAGE_SIZE; + + pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) 
*/ if (address < vma->vm_start || address > vma->vm_end) @@ -491,10 +531,15 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } - #else /* !CONFIG_MMU */ -static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void unmap_mapping_folio(struct folio *folio) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } @@ -567,17 +612,6 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ -/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ -#if defined(CONFIG_SPARSEMEM) -extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn); -#else -static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn) -{ -} -#endif /* CONFIG_SPARSEMEM */ - #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 @@ -713,4 +747,13 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +void free_zone_device_page(struct page *page); + +/* + * mm/gup.c + */ +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); + +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c06db78a76c..92196562687b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,7 +36,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - nr_entries = filter_irq_stacks(entries, nr_entries); return __stack_depot_save(entries, nr_entries, flags, can_alloc); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 587da8995f2d..08291ed33e93 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -132,12 +132,23 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags; if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags); /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. 
*/ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 4a4929b29a23..94136f84b449 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size) +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -520,9 +520,14 @@ int kasan_module_alloc(void *addr, size_t size) __builtin_return_address(0)); if (ret) { + struct vm_struct *vm = find_vm_area(addr); __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; + vm->flags |= VM_KASAN; kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + return 0; } diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile index 6872cd5e5390..0bb95728a784 100644 --- a/mm/kfence/Makefile +++ b/mm/kfence/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_KFENCE) := core.o report.o +obj-y := core.o report.o CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5ad40e3add45..2f9fdfde1941 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -38,22 +38,27 @@ #define KFENCE_WARN_ON(cond) \ ({ \ const bool __cond = WARN_ON(cond); \ - if (unlikely(__cond)) \ + if (unlikely(__cond)) { \ WRITE_ONCE(kfence_enabled, false); \ + disabled_by_warn = true; \ + } \ __cond; \ }) /* === Data ================================================================= */ static bool kfence_enabled __read_mostly; +static bool disabled_by_warn __read_mostly; -static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif #define MODULE_PARAM_PREFIX "kfence." +static int kfence_enable_late(void); static int param_set_sample_interval(const char *val, const struct kernel_param *kp) { unsigned long num; @@ -64,10 +69,11 @@ static int param_set_sample_interval(const char *val, const struct kernel_param if (!num) /* Using 0 to indicate KFENCE is disabled. */ WRITE_ONCE(kfence_enabled, false); - else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) - return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */ *((unsigned long *)kp->arg) = num; + + if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) + return disabled_by_warn ? -EINVAL : kfence_enable_late(); return 0; } @@ -89,8 +95,12 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); +/* If true, use a deferrable timer. */ +static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); +module_param_named(deferrable, kfence_deferrable, bool, 0444); + /* The pool of pages used for guard pages and objects. */ -char *__kfence_pool __ro_after_init; +char *__kfence_pool __read_mostly; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. 
*/ /* @@ -531,17 +541,19 @@ static void rcu_guarded_free(struct rcu_head *h) kfence_guarded_free((void *)meta->addr, meta, false); } -static bool __init kfence_init_pool(void) +/* + * Initialization of the KFENCE pool after its allocation. + * Returns 0 on success; otherwise returns the address up to + * which partial initialization succeeded. + */ +static unsigned long kfence_init_pool(void) { unsigned long addr = (unsigned long)__kfence_pool; struct page *pages; int i; - if (!__kfence_pool) - return false; - if (!arch_kfence_init_pool()) - goto err; + return addr; pages = virt_to_page(addr); @@ -559,7 +571,7 @@ static bool __init kfence_init_pool(void) /* Verify we do not have a compound head page. */ if (WARN_ON(compound_head(&pages[i]) != &pages[i])) - goto err; + return addr; __SetPageSlab(&pages[i]); } @@ -572,7 +584,7 @@ static bool __init kfence_init_pool(void) */ for (i = 0; i < 2; i++) { if (unlikely(!kfence_protect(addr))) - goto err; + return addr; addr += PAGE_SIZE; } @@ -589,7 +601,7 @@ static bool __init kfence_init_pool(void) /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) - goto err; + return addr; addr += 2 * PAGE_SIZE; } @@ -602,9 +614,21 @@ static bool __init kfence_init_pool(void) */ kmemleak_free(__kfence_pool); - return true; + return 0; +} + +static bool __init kfence_init_pool_early(void) +{ + unsigned long addr; + + if (!__kfence_pool) + return false; + + addr = kfence_init_pool(); + + if (!addr) + return true; -err: /* * Only release unprotected pages, and do not try to go back and change * page attributes due to risk of failing to do so as well. If changing @@ -617,6 +641,26 @@ err: return false; } +static bool kfence_init_pool_late(void) +{ + unsigned long addr, free_size; + + addr = kfence_init_pool(); + + if (!addr) + return true; + + /* Same as above. */ + free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); +#ifdef CONFIG_CONTIG_ALLOC + free_contig_range(page_to_pfn(virt_to_page(addr)), free_size / PAGE_SIZE); +#else + free_pages_exact((void *)addr, free_size); +#endif + __kfence_pool = NULL; + return false; +} + /* === DebugFS Interface ==================================================== */ static int stats_show(struct seq_file *seq, void *v) @@ -700,6 +744,8 @@ late_initcall(kfence_debugfs_init); /* === Allocation Gate Timer ================================================ */ +static struct delayed_work kfence_timer; + #ifdef CONFIG_KFENCE_STATIC_KEYS /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); @@ -722,7 +768,6 @@ static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); * avoids IPIs, at the cost of not immediately capturing allocations if the * instructions remain cached. 
*/ -static struct delayed_work kfence_timer; static void toggle_allocation_gate(struct work_struct *work) { if (!READ_ONCE(kfence_enabled)) @@ -750,7 +795,6 @@ static void toggle_allocation_gate(struct work_struct *work) queue_delayed_work(system_unbound_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } -static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); /* === Public interface ===================================================== */ @@ -765,25 +809,77 @@ void __init kfence_alloc_pool(void) pr_err("failed to allocate pool\n"); } +static void kfence_init_enable(void) +{ + if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) + static_branch_enable(&kfence_allocation_key); + + if (kfence_deferrable) + INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); + else + INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); + + WRITE_ONCE(kfence_enabled, true); + queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + + pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); +} + void __init kfence_init(void) { + stack_hash_seed = (u32)random_get_entropy(); + /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) return; - stack_hash_seed = (u32)random_get_entropy(); - if (!kfence_init_pool()) { + if (!kfence_init_pool_early()) { pr_err("%s failed\n", __func__); return; } - if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) - static_branch_enable(&kfence_allocation_key); + kfence_init_enable(); +} + +static int kfence_init_late(void) +{ + const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE; +#ifdef CONFIG_CONTIG_ALLOC + struct page *pages; + + pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL); + if (!pages) + return -ENOMEM; + __kfence_pool = page_to_virt(pages); +#else + if (nr_pages > MAX_ORDER_NR_PAGES) { + pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); + return -EINVAL; + } + __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); + if (!__kfence_pool) + return -ENOMEM; +#endif + + if (!kfence_init_pool_late()) { + pr_err("%s failed\n", __func__); + return -EBUSY; + } + + kfence_init_enable(); + return 0; +} + +static int kfence_enable_late(void) +{ + if (!__kfence_pool) + return kfence_init_late(); + WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); - pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, - CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, - (void *)(__kfence_pool + KFENCE_POOL_SIZE)); + return 0; } void kfence_shutdown_cache(struct kmem_cache *s) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a22b1af85577..1b50f70a4c0f 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); /* * Especially for non-preemption kernels, ensure the allocation-gate * timer can catch up: after @resched_after, every failed allocation * attempt yields, to ensure the allocation-gate timer is scheduled. 
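kfence_enable_late() above makes it possible to bring KFENCE up after boot: writing a non-zero interval to the sample_interval parameter now allocates the pool late (via alloc_contig_pages() or alloc_pages_exact()) instead of returning -EINVAL, unless KFENCE was already disabled by a KFENCE_WARN_ON. A sketch of flipping it on from userspace, assuming the usual sysfs module-parameter path implied by the "kfence." MODULE_PARAM_PREFIX:

  /* Sketch only: enable KFENCE at runtime with a 100ms sample interval.
   * Requires root; the parameter file exists for built-in KFENCE too. */
  #include <stdio.h>

  int main(void)
  {
          const char *param = "/sys/module/kfence/parameters/sample_interval";
          FILE *f = fopen(param, "w");

          if (!f) {
                  perror(param);
                  return 1;
          }
          fprintf(f, "100\n");
          return fclose(f) ? 1 : 0;
  }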
*/ - resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval); do { if (test_cache) alloc = kmem_cache_alloc(test_cache, gfp); @@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test) int i; /* Skip if we think it'd take too long. */ - KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); + KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); @@ -623,10 +623,11 @@ static void test_gfpzero(struct kunit *test) break; test_free(buf2); - if (i == CONFIG_KFENCE_NUM_OBJECTS) { + if (kthread_should_stop() || (i == CONFIG_KFENCE_NUM_OBJECTS)) { kunit_warn(test, "giving up ... cannot get same object back\n"); return; } + cond_resched(); } for (i = 0; i < size; i++) @@ -739,7 +740,7 @@ static void test_memcache_alloc_bulk(struct kunit *test) * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); do { void *objects[100]; int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e99101162f1a..1cdf7c38b9e5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -16,6 +16,7 @@ #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> +#include <linux/page_table_check.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> @@ -618,6 +619,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -636,6 +638,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } @@ -681,7 +684,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. @@ -756,11 +759,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -774,12 +773,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. 
- */ - pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + ptep_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1261,6 +1256,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@ -1270,6 +1266,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@ -1298,6 +1295,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } @@ -1306,7 +1304,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); @@ -1419,6 +1417,21 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return 0; } +static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + spinlock_t *ptl; + pmd_t pmd; + + mmap_assert_write_locked(mm); + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_collapse_flush(vma, addr, pmdp); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pmd); + pte_free(mm, pmd_pgtable(pmd)); +} + /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. @@ -1436,7 +1449,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma = find_vma(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd, _pmd; + pmd_t *pmd; spinlock_t *ptl; int count = 0; int i; @@ -1500,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); @@ -1512,12 +1525,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) } /* step 4: collapse pmd */ - ptl = pmd_lock(vma->vm_mm, pmd); - _pmd = pmdp_collapse_flush(vma, haddr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - + collapse_and_free_pmd(mm, vma, haddr, pmd); drop_hpage: unlock_page(hpage); put_page(hpage); @@ -1555,7 +1563,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; struct mm_struct *mm; unsigned long addr; - pmd_t *pmd, _pmd; + pmd_t *pmd; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1594,14 +1602,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * reverse order. Trylock is a way to avoid deadlock. 
*/ if (mmap_write_trylock(mm)) { - if (!khugepaged_test_exit(mm)) { - spinlock_t *ptl = pmd_lock(mm, pmd); - /* assume page table is clear */ - _pmd = pmdp_collapse_flush(vma, addr, pmd); - spin_unlock(ptl); - mm_dec_nr_ptes(mm); - pte_free(mm, pmd_pgtable(_pmd)); - } + if (!khugepaged_test_exit(mm)) + collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { /* Try again later */ @@ -1667,7 +1669,10 @@ static void collapse_file(struct mm_struct *mm, } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); - /* This will be less messy when we use multi-index entries */ + /* + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present + */ do { xas_lock_irq(&xas); xas_create_range(&xas); @@ -1829,13 +1834,13 @@ static void collapse_file(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_pages(mapping, index, 1, false); + try_to_unmap(page_folio(page), + TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: @@ -1892,10 +1897,20 @@ out_unlock: __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } + /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_page); xa_locked: xas_unlock_irq(&xas); xa_unlocked: + /* + * If collapse is successful, flush must be done now before copying. + * If collapse is unsuccessful, does flush actually need to be done? + * Do it anyway, to clear the state. + */ + try_to_unmap_flush(); + if (result == SCAN_SUCCEED) { struct page *page, *tmp; @@ -2008,11 +2023,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } + /* + * XXX: khugepaged should compact smaller compound pages + * into a PMD sized page + */ if (PageTransCompound(page)) { result = SCAN_PAGE_COMPOUND; break; @@ -2054,6 +2074,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b57383c17cf6..7580baa76af1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object) static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { struct rb_node *rb = object_tree_root.rb_node; + unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { - struct kmemleak_object *object = - rb_entry(rb, struct kmemleak_object, rb_node); - if (ptr < object->pointer) + struct kmemleak_object *object; + unsigned long untagged_objp; + + object = rb_entry(rb, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + + if (untagged_ptr < untagged_objp) rb = object->rb_node.rb_left; - else if (object->pointer + object->size <= ptr) + else if (untagged_objp + object->size <= untagged_ptr) rb = object->rb_node.rb_right; - else if (object->pointer == ptr || alias) + else if (untagged_objp == untagged_ptr || alias) return object; else { kmemleak_warn("Found object 
by alias at 0x%08lx\n", @@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; unsigned long untagged_ptr; + unsigned long untagged_objp; object = mem_pool_alloc(gfp); if (!object) { @@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, while (*link) { rb_parent = *link; parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); - if (ptr + size <= parent->pointer) + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) link = &parent->rb_node.rb_left; - else if (parent->pointer + parent->size <= ptr) + else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", @@ -1403,7 +1410,8 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object; - int i; + struct zone *zone; + int __maybe_unused i; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1443,9 +1451,9 @@ static void kmemleak_scan(void) * Struct page scanning for each node. */ get_online_mems(); - for_each_online_node(i) { - unsigned long start_pfn = node_start_pfn(i); - unsigned long end_pfn = node_end_pfn(i); + for_each_populated_zone(zone) { + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -1454,8 +1462,8 @@ static void kmemleak_scan(void) if (!page) continue; - /* only scan pages belonging to this node */ - if (page_to_nid(page) != i) + /* only scan pages belonging to this zone */ + if (page_zone(page) != zone) continue; /* only scan if page is in use */ if (page_count(page) == 0) @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> @@ -1033,10 +1034,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - }; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; @@ -1176,7 +1174,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1251,16 +1249,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: @@ -2566,7 +2554,8 @@ void __ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct anon_vma *anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + struct anon_vma *anon_vma = folio_anon_vma(folio); struct page *new_page; if (PageKsm(page)) { @@ -2575,8 +2564,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* no need to copy it */ } else if 
(!anon_vma) { return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { + } else if (page->index == linear_page_index(vma, address) && + anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (!PageUptodate(page)) @@ -2594,26 +2583,29 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); +#ifdef CONFIG_SWAP + count_vm_event(KSM_SWPIN_COPY); +#endif } return new_page; } -void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; int search_new_forks = 0; - VM_BUG_ON_PAGE(!PageKsm(page), page); + VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - stable_node = page_stable_node(page); + stable_node = folio_stable_node(folio); if (!stable_node) return; again: @@ -2648,11 +2640,11 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { + if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } - if (rwc->done && rwc->done(page)) { + if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } @@ -2825,8 +2817,7 @@ static void wait_while_offlining(void) #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ - static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/mm/list_lru.c b/mm/list_lru.c index 0cd5e89ca063..c669d87001a6 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -13,6 +13,7 @@ #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" +#include "internal.h" #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(memcg_list_lrus); @@ -49,35 +50,32 @@ static int lru_shrinker_id(struct list_lru *lru) } static inline struct list_lru_one * -list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { - struct list_lru_memcg *memcg_lrus; - /* - * Either lock or RCU protects the array of per cgroup lists - * from relocation (see memcg_update_list_lru_node). - */ - memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, - lockdep_is_held(&nlru->lock)); - if (memcg_lrus && idx >= 0) - return memcg_lrus->lru[idx]; - return &nlru->lru; + if (list_lru_memcg_aware(lru) && idx >= 0) { + struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); + + return mlru ? 
&mlru->node[nid] : NULL; + } + return &lru->node[nid].lru; } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, +list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, struct mem_cgroup **memcg_ptr) { + struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = &nlru->lru; struct mem_cgroup *memcg = NULL; - if (!nlru->memcg_lrus) + if (!list_lru_memcg_aware(lru)) goto out; memcg = mem_cgroup_from_obj(ptr); if (!memcg) goto out; - l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); out: if (memcg_ptr) *memcg_ptr = memcg; @@ -103,18 +101,18 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) } static inline struct list_lru_one * -list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { - return &nlru->lru; + return &lru->node[nid].lru; } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, +list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, struct mem_cgroup **memcg_ptr) { if (memcg_ptr) *memcg_ptr = NULL; - return &nlru->lru; + return &lru->node[nid].lru; } #endif /* CONFIG_MEMCG_KMEM */ @@ -127,7 +125,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item, &memcg); + l = list_lru_from_kmem(lru, nid, item, &memcg); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) @@ -150,7 +148,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(nlru, item, NULL); + l = list_lru_from_kmem(lru, nid, item, NULL); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -180,13 +178,12 @@ EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; long count; rcu_read_lock(); - l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); - count = READ_ONCE(l->nr_items); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + count = l ? 
READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) @@ -206,17 +203,20 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx, +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - + struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; - l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: + l = list_lru_from_memcg_idx(lru, nid, memcg_idx); + if (!l) + goto out; + list_for_each_safe(item, n, &l->list) { enum lru_status ret; @@ -260,6 +260,7 @@ restart: BUG(); } } +out: return isolated; } @@ -272,8 +273,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long ret; spin_lock(&nlru->lock); - ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, - nr_to_walk); + ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, + cb_arg, nr_to_walk); spin_unlock(&nlru->lock); return ret; } @@ -288,8 +289,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long ret; spin_lock_irq(&nlru->lock); - ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, - nr_to_walk); + ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, + cb_arg, nr_to_walk); spin_unlock_irq(&nlru->lock); return ret; } @@ -299,16 +300,20 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, unsigned long *nr_to_walk) { long isolated = 0; - int memcg_idx; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); + +#ifdef CONFIG_MEMCG_KMEM if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { - for_each_memcg_cache_index(memcg_idx) { + struct list_lru_memcg *mlru; + unsigned long index; + + xa_for_each(&lru->xa, index, mlru) { struct list_lru_node *nlru = &lru->node[nid]; spin_lock(&nlru->lock); - isolated += __list_lru_walk_one(nlru, memcg_idx, + isolated += __list_lru_walk_one(lru, nid, index, isolate, cb_arg, nr_to_walk); spin_unlock(&nlru->lock); @@ -317,6 +322,8 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, break; } } +#endif + return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); @@ -328,204 +335,81 @@ static void init_one_lru(struct list_lru_one *l) } #ifdef CONFIG_MEMCG_KMEM -static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, - int begin, int end) +static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) { - int i; - - for (i = begin; i < end; i++) - kfree(memcg_lrus->lru[i]); -} - -static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, - int begin, int end) -{ - int i; + int nid; + struct list_lru_memcg *mlru; - for (i = begin; i < end; i++) { - struct list_lru_one *l; + mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); + if (!mlru) + return NULL; - l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); - if (!l) - goto fail; + for_each_node(nid) + init_one_lru(&mlru->node[nid]); - init_one_lru(l); - memcg_lrus->lru[i] = l; - } - return 0; -fail: - __memcg_destroy_list_lru_node(memcg_lrus, begin, i); - return -ENOMEM; + return mlru; } -static int memcg_init_list_lru_node(struct list_lru_node *nlru) +static void memcg_list_lru_free(struct list_lru *lru, int src_idx) { - struct list_lru_memcg *memcg_lrus; - int size = memcg_nr_cache_ids; - - memcg_lrus = 
kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL); - if (!memcg_lrus) - return -ENOMEM; - - if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { - kvfree(memcg_lrus); - return -ENOMEM; - } - RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); - - return 0; -} + struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); -static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) -{ - struct list_lru_memcg *memcg_lrus; /* - * This is called when shrinker has already been unregistered, - * and nobody can use it. So, there is no need to use kvfree_rcu(). + * The __list_lru_walk_one() can walk the list of this node. + * We need kvfree_rcu() here. And the walking of the list + * is under lru->node[nid]->lock, which can serve as a RCU + * read-side critical section. */ - memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); - __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); - kvfree(memcg_lrus); + if (mlru) + kvfree_rcu(mlru, rcu); } -static int memcg_update_list_lru_node(struct list_lru_node *nlru, - int old_size, int new_size) +static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { - struct list_lru_memcg *old, *new; - - BUG_ON(old_size > new_size); - - old = rcu_dereference_protected(nlru->memcg_lrus, - lockdep_is_held(&list_lrus_mutex)); - new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL); - if (!new) - return -ENOMEM; - - if (__memcg_init_list_lru_node(new, old_size, new_size)) { - kvfree(new); - return -ENOMEM; - } - - memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); - rcu_assign_pointer(nlru->memcg_lrus, new); - kvfree_rcu(old, rcu); - return 0; -} - -static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, - int old_size, int new_size) -{ - struct list_lru_memcg *memcg_lrus; - - memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, - lockdep_is_held(&list_lrus_mutex)); - /* do not bother shrinking the array back to the old size, because we - * cannot handle allocation failures here */ - __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); -} - -static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) -{ - int i; - + if (memcg_aware) + xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; - - if (!memcg_aware) - return 0; - - for_each_node(i) { - if (memcg_init_list_lru_node(&lru->node[i])) - goto fail; - } - return 0; -fail: - for (i = i - 1; i >= 0; i--) { - if (!lru->node[i].memcg_lrus) - continue; - memcg_destroy_list_lru_node(&lru->node[i]); - } - return -ENOMEM; } static void memcg_destroy_list_lru(struct list_lru *lru) { - int i; + XA_STATE(xas, &lru->xa, 0); + struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; - for_each_node(i) - memcg_destroy_list_lru_node(&lru->node[i]); -} - -static int memcg_update_list_lru(struct list_lru *lru, - int old_size, int new_size) -{ - int i; - - for_each_node(i) { - if (memcg_update_list_lru_node(&lru->node[i], - old_size, new_size)) - goto fail; - } - return 0; -fail: - for (i = i - 1; i >= 0; i--) { - if (!lru->node[i].memcg_lrus) - continue; - - memcg_cancel_update_list_lru_node(&lru->node[i], - old_size, new_size); + xas_lock_irq(&xas); + xas_for_each(&xas, mlru, ULONG_MAX) { + kfree(mlru); + xas_store(&xas, NULL); } - return -ENOMEM; -} - -static void memcg_cancel_update_list_lru(struct list_lru *lru, - int old_size, int new_size) -{ - int i; - - for_each_node(i) - memcg_cancel_update_list_lru_node(&lru->node[i], - old_size, new_size); + xas_unlock_irq(&xas); } 
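The xarray conversion above replaces the per-node memcg_lrus arrays, which had to be resized for every new kmem cgroup (see the memcg_update_all_list_lrus() removal just below), with one lazily populated xarray per list_lru indexed by kmemcg_id. A rough kernel-context sketch of a memcg-aware consumer under the new scheme; the demo_* names are invented for illustration, only calls visible in this diff are assumed, and it is not a standalone program:

#include <linux/list_lru.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>

/* Hypothetical object kept on a memcg-aware list_lru. */
struct demo_obj {
        struct list_head lru;
};

static struct list_lru demo_lru;   /* assumed set up with list_lru_init_memcg() */

static int demo_cache_add(struct demo_obj *obj, struct mem_cgroup *memcg)
{
        int err;

        /*
         * Per-memcg lists now hang off demo_lru.xa and are created on
         * demand, so make sure this memcg and its ancestors have one
         * before queueing the object (cf. memcg_list_lru_alloc() below).
         */
        err = memcg_list_lru_alloc(memcg, &demo_lru, GFP_KERNEL);
        if (err)
                return err;

        /* The consumer-facing add/del/count API is unchanged. */
        list_lru_add(&demo_lru, &obj->lru);
        return 0;
}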
-int memcg_update_all_list_lrus(int new_size) -{ - int ret = 0; - struct list_lru *lru; - int old_size = memcg_nr_cache_ids; - - mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &memcg_list_lrus, list) { - ret = memcg_update_list_lru(lru, old_size, new_size); - if (ret) - goto fail; - } -out: - mutex_unlock(&list_lrus_mutex); - return ret; -fail: - list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list) - memcg_cancel_update_list_lru(lru, old_size, new_size); - goto out; -} - -static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, - int src_idx, struct mem_cgroup *dst_memcg) +static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, + int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; /* + * If there is no lru entry in this nlru, we can skip it immediately. + */ + if (!READ_ONCE(nlru->nr_items)) + return; + + /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); - src = list_lru_from_memcg_idx(nlru, src_idx); - dst = list_lru_from_memcg_idx(nlru, dst_idx); + src = list_lru_from_memcg_idx(lru, nid, src_idx); + if (!src) + goto out; + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); list_splice_init(&src->list, &dst->list); @@ -534,32 +418,143 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } - +out: spin_unlock_irq(&nlru->lock); } -static void memcg_drain_list_lru(struct list_lru *lru, - int src_idx, struct mem_cgroup *dst_memcg) +static void memcg_reparent_list_lru(struct list_lru *lru, + int src_idx, struct mem_cgroup *dst_memcg) { int i; for_each_node(i) - memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); + memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); + + memcg_list_lru_free(lru, src_idx); } -void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) +void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { + struct cgroup_subsys_state *css; struct list_lru *lru; + int src_idx = memcg->kmemcg_id; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. + * + * After we have finished, all list_lrus corresponding to this cgroup + * are guaranteed to remain empty. So we can safely free this cgroup's + * list lrus in memcg_list_lru_free(). + * + * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() + * from allocating list lrus for this cgroup after memcg_list_lru_free() + * call. 
+ */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &memcg->css) { + struct mem_cgroup *child; + + child = mem_cgroup_from_css(css); + WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); + } + rcu_read_unlock(); mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) - memcg_drain_list_lru(lru, src_idx, dst_memcg); + memcg_reparent_list_lru(lru, src_idx, parent); mutex_unlock(&list_lrus_mutex); } + +static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, + struct list_lru *lru) +{ + int idx = memcg->kmemcg_id; + + return idx < 0 || xa_load(&lru->xa, idx); +} + +int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, + gfp_t gfp) +{ + int i; + unsigned long flags; + struct list_lru_memcg_table { + struct list_lru_memcg *mlru; + struct mem_cgroup *memcg; + } *table; + XA_STATE(xas, &lru->xa, 0); + + if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) + return 0; + + gfp &= GFP_RECLAIM_MASK; + table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); + if (!table) + return -ENOMEM; + + /* + * Because the list_lru can be reparented to the parent cgroup's + * list_lru, we should make sure that this cgroup and all its + * ancestors have allocated list_lru_memcg. + */ + for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { + if (memcg_list_lru_allocated(memcg, lru)) + break; + + table[i].memcg = memcg; + table[i].mlru = memcg_init_list_lru_one(gfp); + if (!table[i].mlru) { + while (i--) + kfree(table[i].mlru); + kfree(table); + return -ENOMEM; + } + } + + xas_lock_irqsave(&xas, flags); + while (i--) { + int index = READ_ONCE(table[i].memcg->kmemcg_id); + struct list_lru_memcg *mlru = table[i].mlru; + + xas_set(&xas, index); +retry: + if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { + kfree(mlru); + } else { + xas_store(&xas, mlru); + if (xas_error(&xas) == -ENOMEM) { + xas_unlock_irqrestore(&xas, flags); + if (xas_nomem(&xas, gfp)) + xas_set_err(&xas, 0); + xas_lock_irqsave(&xas, flags); + /* + * The xas lock has been released, this memcg + * can be reparented before us. So reload + * memcg id. More details see the comments + * in memcg_reparent_list_lrus(). + */ + index = READ_ONCE(table[i].memcg->kmemcg_id); + if (index < 0) + xas_set_err(&xas, 0); + else if (!xas_error(&xas) && index != xas.xa_index) + xas_set(&xas, index); + goto retry; + } + } + } + /* xas_nomem() is used to free memory instead of memory allocation. 
*/ + if (xas.xa_alloc) + xas_nomem(&xas, gfp); + xas_unlock_irqrestore(&xas, flags); + kfree(table); + + return xas_error(&xas); +} #else -static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { - return 0; } static void memcg_destroy_list_lru(struct list_lru *lru) @@ -571,7 +566,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; - int err = -ENOMEM; #ifdef CONFIG_MEMCG_KMEM if (shrinker) @@ -579,11 +573,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, else lru->shrinker_id = -1; #endif - memcg_get_cache_ids(); lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) - goto out; + return -ENOMEM; for_each_node(i) { spin_lock_init(&lru->node[i].lock); @@ -592,18 +585,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, init_one_lru(&lru->node[i].lru); } - err = memcg_init_list_lru(lru, memcg_aware); - if (err) { - kfree(lru->node); - /* Do this so a list_lru_destroy() doesn't crash: */ - lru->node = NULL; - goto out; - } - + memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); -out: - memcg_put_cache_ids(); - return err; + + return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); @@ -613,8 +598,6 @@ void list_lru_destroy(struct list_lru *lru) if (!lru->node) return; - memcg_get_cache_ids(); - list_lru_unregister(lru); memcg_destroy_list_lru(lru); @@ -624,6 +607,5 @@ void list_lru_destroy(struct list_lru *lru) #ifdef CONFIG_MEMCG_KMEM lru->shrinker_id = -1; #endif - memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/maccess.c b/mm/maccess.c index d3f1a1f0b1c1..3fed2b876539 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -335,3 +335,9 @@ long strnlen_user_nofault(const void __user *unsafe_addr, long count) return ret; } + +void __copy_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} +EXPORT_SYMBOL(__copy_overflow); diff --git a/mm/madvise.c b/mm/madvise.c index 8c927202bbe6..39b712fd8300 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,8 @@ #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/mm_inline.h> +#include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> @@ -62,83 +64,94 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +void anon_vma_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + + if (vma->vm_file) + return NULL; + + return vma->anon_name; +} + +/* mmap_lock should be write-locked */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + struct anon_vma_name *orig_name = anon_vma_name(vma); + + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); + return 0; + } + + if 
(anon_vma_name_eq(orig_name, anon_name)) + return 0; + + vma->anon_name = anon_vma_name_reuse(anon_name); + anon_vma_name_put(orig_name); + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + if (anon_name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. + * Update the vm_flags on region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma. */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags, + struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; - int error = 0; + int error; pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; - - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_WIPEONFORK: - /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ - if (vma->vm_file || vma->vm_flags & VM_SHARED) { - error = -EINVAL; - goto out; - } - new_flags |= VM_WIPEONFORK; - break; - case MADV_KEEPONFORK: - new_flags &= ~VM_WIPEONFORK; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out_convert_errno; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out_convert_errno; - break; - } - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; goto success; @@ -147,23 +160,19 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, start, 1); if (error) - goto out_convert_errno; + return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, end, 0); if (error) - goto out_convert_errno; + return error; } success: @@ -171,16 +180,13 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_anon_vma_name(vma, anon_name); + if (error) + return error; + } -out_convert_errno: - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; -out: - return error; + return 0; } #ifdef CONFIG_SWAP @@ -496,6 +502,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) @@ -843,8 +854,8 @@ static long madvise_populate(struct vm_area_struct *vma, * our VMA might have been split. */ if (!vma || start >= vma->vm_end) { - vma = find_vma(mm, start); - if (!vma || start < vma->vm_start) + vma = vma_lookup(mm, start); + if (!vma) return -ENOMEM; } @@ -930,6 +941,99 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. 
+ */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + struct anon_vma_name *anon_name; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + anon_name = anon_vma_name(vma); + anon_vma_name_get(anon_name); + error = madvise_update_vma(vma, prev, start, end, new_flags, + anon_name); + anon_vma_name_put(anon_name); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. 
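madvise_vma_behavior() above folds the old madvise_vma()/madvise_behavior() split into a single dispatcher: non-flag advice (MADV_REMOVE, MADV_WILLNEED, MADV_DONTNEED, ...) is handled directly, while flag-changing advice funnels into madvise_update_vma(), which merges or splits the VMA as needed and maps a transient -ENOMEM to -EAGAIN. A minimal userspace sketch of the visible behavior, assuming glibc headers recent enough to define MADV_WIPEONFORK:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        size_t len = 3 * page;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0xaa, len);

        /*
         * Flag-changing advice on the middle page only: the kernel must
         * split the VMA (madvise_update_vma() -> __split_vma()); the call
         * fails with EINVAL on file-backed or shared mappings.
         */
        if (madvise(p + page, page, MADV_WIPEONFORK))
                perror("MADV_WIPEONFORK");

        /* Non-flag advice, dispatched directly by madvise_vma_behavior(). */
        if (madvise(p, len, MADV_DONTNEED))
                perror("MADV_DONTNEED");

        munmap(p, len);
        return 0;
}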
@@ -968,6 +1072,8 @@ static int madvise_inject_error(int behavior, pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_COUNT_INCREASED); + if (ret == -EOPNOTSUPP) + ret = 0; } if (ret) @@ -978,30 +1084,6 @@ static int madvise_inject_error(int behavior, } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COLD: - return madvise_cold(vma, prev, start, end); - case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static bool madvise_behavior_valid(int behavior) { @@ -1056,6 +1138,122 @@ process_madvise_behavior_valid(int behavior) } /* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing. + */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_lock */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} + +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long anon_name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (struct anon_vma_name *)anon_name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. 
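madvise_vma_anon_name() here, together with the madvise_set_anon_name() entry point that follows, is the backend for naming anonymous mappings; in the wider series it is reached from prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) rather than from madvise(2) itself, and that wiring is not part of this hunk. A hedged userspace sketch, assuming a kernel built with CONFIG_ANON_VMA_NAME; the PR_* fallback defines are only for uapi headers that predate the feature:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA              0x53564d41
#define PR_SET_VMA_ANON_NAME    0
#endif

int main(void)
{
        size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /*
         * The kernel copies the string into a refcounted anon_vma_name
         * (anon_vma_name_alloc() earlier in this file) and the region then
         * shows up as "[anon:demo_heap]" in /proc/<pid>/maps. Expect EINVAL
         * if the kernel lacks CONFIG_ANON_VMA_NAME, and EBADF if the range
         * covers a file-backed mapping.
         */
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)"demo_heap"))
                perror("PR_SET_VMA_ANON_NAME");

        munmap(p, len);
        return 0;
}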
+ */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ +/* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should @@ -1127,10 +1325,8 @@ process_madvise_behavior_valid(int behavior) */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; - int error = -EINVAL; + unsigned long end; + int error; int write; size_t len; struct blk_plug plug; @@ -1138,23 +1334,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr(start); if (!madvise_behavior_valid(behavior)) - return error; + return -EINVAL; if (!PAGE_ALIGNED(start)) - return error; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return error; + return -EINVAL; end = start + len; if (end < start) - return error; + return -EINVAL; - error = 0; if (end == start) - return error; + return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -1169,51 +1364,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); - } -out: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); @@ -1280,15 +1433,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, while (iov_iter_count(&iter)) { iovec = iov_iter_iovec(&iter); + /* + * do_madvise returns ENOMEM if unmapped holes are present + * in the passed VMA. process_madvise() is expected to skip + * unmapped holes passed to it in the 'struct iovec' list + * and not fail because of them. Thus treat -ENOMEM return + * from do_madvise as valid and continue processing. 
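The hole handling spelled out just above is the user-visible half of madvise_walk_vmas(): advice is applied to every mapped VMA in the range, unmapped gaps still make the call report -ENOMEM, and process_madvise() now treats that return as non-fatal so it can keep iterating over its iovec list. A small sketch of the plain madvise(2) view of the same rule, assuming only standard libc:

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Punch a hole: unmap the middle page of the three-page mapping. */
        munmap(p + page, page);

        /*
         * madvise_walk_vmas() still visits the first and the last page,
         * then reports the gap: expect -1 with errno == ENOMEM even though
         * the advice was applied to the mapped parts.
         */
        if (madvise(p, 3 * page, MADV_DONTNEED) == -1 && errno == ENOMEM)
                printf("advice applied around the hole, ENOMEM reported\n");

        munmap(p, page);
        munmap(p + 2 * page, page);
        return 0;
}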
+ */ ret = do_madvise(mm, (unsigned long)iovec.iov_base, iovec.iov_len, behavior); - if (ret < 0) + if (ret < 0 && ret != -ENOMEM) break; iov_iter_advance(&iter, iovec.iov_len); } - if (ret == 0) - ret = total_len - iov_iter_count(&iter); + ret = (total_len - iov_iter_count(&iter)) ? : ret; release_mm: mmput(mm); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index ea734f248fce..1b0ab8fcfd8b 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -3,6 +3,7 @@ #include <linux/hugetlb.h> #include <linux/bitops.h> #include <linux/mmu_notifier.h> +#include <linux/mm_inline.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> diff --git a/mm/memblock.c b/mm/memblock.c index 1018e50566f3..b12a364f2766 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -366,14 +366,20 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - memblock_free_late(addr, size); + if (memblock_reserved_in_slab) + kfree(memblock.reserved.regions); + else + memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - memblock_free_late(addr, size); + if (memblock_memory_in_slab) + kfree(memblock.memory.regions); + else + memblock_free_late(addr, size); } memblock_memory = NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a7b3ebf8e48..d495c2acb9f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -53,6 +53,7 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmpressure.h> +#include <linux/memremap.h> #include <linux/mm_inline.h> #include <linux/swap_cgroup.h> #include <linux/cpu.h> @@ -84,7 +85,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -254,7 +255,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM -extern spinlock_t css_set_lock; +static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { @@ -298,9 +299,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); - spin_lock_irqsave(&css_set_lock, flags); + spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); - spin_unlock_irqrestore(&css_set_lock, flags); + spin_unlock_irqrestore(&objcg_lock, flags); percpu_ref_exit(ref); kfree_rcu(objcg, rcu); @@ -332,7 +333,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - spin_lock_irq(&css_set_lock); + spin_lock_irq(&objcg_lock); /* 1) Ready to reparent active objcg. */ list_add(&objcg->list, &memcg->objcg_list); @@ -342,54 +343,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); - spin_unlock_irq(&css_set_lock); + spin_unlock_irq(&objcg_lock); percpu_ref_kill(&objcg->refcnt); } /* - * This will be used as a shrinker list's index. - * The main reason for not using cgroup id for this: - * this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. 
Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * The current size of the caches array is stored in memcg_nr_cache_ids. It - * will double each time we have to increase it. - */ -static DEFINE_IDA(memcg_cache_ida); -int memcg_nr_cache_ids; - -/* Protects memcg_nr_cache_ids */ -static DECLARE_RWSEM(memcg_cache_ids_sem); - -void memcg_get_cache_ids(void) -{ - down_read(&memcg_cache_ids_sem); -} - -void memcg_put_cache_ids(void) -{ - up_read(&memcg_cache_ids_sem); -} - -/* - * MIN_SIZE is different than 1, because we would like to avoid going through - * the alloc/free process all the time. In a small machine, 4 kmem-limited - * cgroups is a reasonable guess. In the future, it could be a parameter or - * tunable, but that is strictly not necessary. - * - * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get - * this constant directly from cgroup, but it is understandable that this is - * better kept as an internal representation in cgroup.c. In any case, the - * cgrp_id space is not getting any smaller, and we don't have to necessarily - * increase ours as well if it increases. - */ -#define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX - -/* * A lot of the calls to the cache allocation functions are expected to be * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are * conditional to this static branch, we'll have to allow modules that does @@ -629,11 +588,46 @@ static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion + * is sufficient. + */ +static void memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#else + VM_BUG_ON(!irqs_disabled()); +#endif +} + +static void __memcg_stats_lock(void) { +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif +} + +static void memcg_stats_unlock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif +} + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +{ + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } } static void __mem_cgroup_flush_stats(void) @@ -656,7 +650,7 @@ void mem_cgroup_flush_stats(void) static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); } @@ -672,7 +666,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. 
*/ @@ -699,13 +693,35 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + /* + * The caller from rmap relay on disabled preemption becase they never + * update their counter from in-interrupt context. For these two + * counters we check that the update is never performed from an + * interrupt context while other caller need to have disabled interrupt. + */ + __memcg_stats_lock(); + if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + switch (idx) { + case NR_ANON_MAPPED: + case NR_FILE_MAPPED: + case NR_ANON_THPS: + case NR_SHMEM_PMDMAPPED: + case NR_FILE_PMDMAPPED: + WARN_ON_ONCE(!in_task()); + break; + default: + WARN_ON_ONCE(!irqs_disabled()); + } + } + /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); + memcg_stats_unlock(); } /** @@ -788,8 +804,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); + memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -852,6 +870,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, int nid) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1251,8 +1272,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added - * to or just after a page is removed from an lru list (that ordering being - * so as to allow it to check that lru_size 0 is consistent with list_empty). + * to or just after a page is removed from an lru list. */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages) @@ -1365,10 +1385,12 @@ struct memory_stat { static const struct memory_stat memory_stats[] = { { "anon", NR_ANON_MAPPED }, { "file", NR_FILE_PAGES }, + { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -1788,20 +1810,16 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -enum oom_status { - OOM_SUCCESS, - OOM_FAILED, - OOM_ASYNC, - OOM_SKIPPED -}; - -static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +/* + * Returns true if successfully killed one or more processes. Though in some + * corner cases it can return true even without killing any process. 
+ */ +static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - enum oom_status ret; - bool locked; + bool locked, ret; if (order > PAGE_ALLOC_COSTLY_ORDER) - return OOM_SKIPPED; + return false; memcg_memory_event(memcg, MEMCG_OOM); @@ -1824,14 +1842,13 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int * victim and then we have to bail out from the charge path. */ if (memcg->oom_kill_disable) { - if (!current->in_user_fault) - return OOM_SKIPPED; - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; - - return OOM_ASYNC; + if (current->in_user_fault) { + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + } + return false; } mem_cgroup_mark_under_oom(memcg); @@ -1842,10 +1859,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int mem_cgroup_oom_notify(memcg); mem_cgroup_unmark_under_oom(memcg); - if (mem_cgroup_out_of_memory(memcg, mask, order)) - ret = OOM_SUCCESS; - else - ret = OOM_FAILED; + ret = mem_cgroup_out_of_memory(memcg, mask, order); if (locked) mem_cgroup_oom_unlock(memcg); @@ -2078,45 +2092,47 @@ void unlock_page_memcg(struct page *page) folio_memcg_unlock(page_folio(page)); } -struct obj_stock { +struct memcg_stock_pcp { + local_lock_t stock_lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#else - int dummy[0]; #endif -}; - -struct memcg_stock_pcp { - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; - struct obj_stock task_obj; - struct obj_stock irq_obj; struct work_struct work; unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { + .stock_lock = INIT_LOCAL_LOCK(stock_lock), +}; static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct obj_stock *stock); +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else -static inline void drain_obj_stock(struct obj_stock *stock) +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { + return NULL; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) { return false; } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ +} #endif /** @@ -2139,7 +2155,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @@ -2147,7 +2163,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2176,6 +2192,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { 
struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -2183,28 +2200,25 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(&stock->irq_obj); - if (in_task()) - drain_obj_stock(&stock->task_obj); + old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } /* * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned long flags; - - local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ @@ -2216,8 +2230,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); +} - local_irq_restore(flags); +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + unsigned long flags; + + local_lock_irqsave(&memcg_stock.stock_lock, flags); + __refill_stock(memcg, nr_pages); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -2237,7 +2258,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. */ - curcpu = get_cpu(); + migrate_disable(); + curcpu = smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @@ -2260,7 +2282,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) schedule_work_on(cpu, &stock->work); } } - put_cpu(); + migrate_enable(); mutex_unlock(&percpu_charge_mutex); } @@ -2534,7 +2556,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; - enum oom_status oom_status; unsigned long nr_reclaimed; bool passed_oom = false; bool may_swap = true; @@ -2563,15 +2584,6 @@ retry: } /* - * Memcg doesn't have a dedicated reserve for atomic - * allocations. But like the global atomic pool, we need to - * put the burden of reclaim on regular allocation requests - * and let these go through as privileged allocations. - */ - if (gfp_mask & __GFP_ATOMIC) - goto force; - - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, * but we prefer facilitating memory reclaim and getting back @@ -2637,15 +2649,20 @@ retry: * a forward progress or bypass the charge if the oom killer * couldn't make any progress. */ - oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, - get_order(nr_pages * PAGE_SIZE)); - if (oom_status == OOM_SUCCESS) { + if (mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE))) { passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; } nomem: - if (!(gfp_mask & __GFP_NOFAIL)) + /* + * Memcg doesn't have a dedicated reserve for atomic + * allocations. 
But like the global atomic pool, we need to + * put the burden of reclaim on regular allocation requests + * and let these go through as privileged allocations. + */ + if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: /* @@ -2681,7 +2698,7 @@ done_restock: READ_ONCE(memcg->swap.high); /* Don't bother a random interrupted task */ - if (in_interrupt()) { + if (!in_task()) { if (mem_high) { schedule_work(&memcg->high_work); break; @@ -2705,6 +2722,11 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && + !(current->flags & PF_MEMALLOC) && + gfpflags_allow_blocking(gfp_mask)) { + mem_cgroup_handle_over_high(); + } return 0; } @@ -2741,20 +2763,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(objcg); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - - return memcg; -} - #ifdef CONFIG_MEMCG_KMEM /* * The allocated objcg pointers array is not accounted directly. @@ -2764,41 +2772,6 @@ retry: #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) /* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. User context code can - * access interrupt object stock, but not vice versa. - */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - -/* * mod_objcg_mlstate() may be called with irq enabled, so * mod_memcg_lruvec_state() should be used. */ @@ -2929,48 +2902,17 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) return objcg; } -static int memcg_alloc_cache_id(void) +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) { - int id, size; - int err; - - id = ida_simple_get(&memcg_cache_ida, - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); - if (id < 0) - return id; - - if (id < memcg_nr_cache_ids) - return id; - - /* - * There's no space for the new id in memcg_caches arrays, - * so we have to grow them. 
- */ - down_write(&memcg_cache_ids_sem); - - size = 2 * (id + 1); - if (size < MEMCG_CACHES_MIN_SIZE) - size = MEMCG_CACHES_MIN_SIZE; - else if (size > MEMCG_CACHES_MAX_SIZE) - size = MEMCG_CACHES_MAX_SIZE; - - err = memcg_update_all_list_lrus(size); - if (!err) - memcg_nr_cache_ids = size; - - up_write(&memcg_cache_ids_sem); - - if (err) { - ida_simple_remove(&memcg_cache_ida, id); - return err; + mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (nr_pages > 0) + page_counter_charge(&memcg->kmem, nr_pages); + else + page_counter_uncharge(&memcg->kmem, -nr_pages); } - return id; } -static void memcg_free_cache_id(int id) -{ - ida_simple_remove(&memcg_cache_ida, id); -} /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg @@ -2984,8 +2926,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3011,8 +2952,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_charge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -3068,17 +3008,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); int *bytes; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + stock = this_cpu_ptr(&memcg_stock); + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx * changes. */ if (stock->cached_objcg != objcg) { - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; @@ -3122,38 +3066,51 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); bool ret = false; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } -static void drain_obj_stock(struct obj_stock *stock) +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = stock->cached_objcg; if (!old) - return; + return NULL; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) - obj_cgroup_uncharge_pages(old, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(old); + + memcg_account_kmem(memcg, -nr_pages); + __refill_stock(memcg, nr_pages); + + css_put(&memcg->css); + } /* * The leftover is flushed to the centralized per-memcg value. @@ -3188,8 +3145,12 @@ static void drain_obj_stock(struct obj_stock *stock) stock->cached_pgdat = NULL; } - obj_cgroup_put(old); stock->cached_objcg = NULL; + /* + * The `old' objects needs to be released by the caller via + * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. + */ + return old; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -3197,13 +3158,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { struct mem_cgroup *memcg; - if (in_task() && stock->task_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; - } - if (stock->irq_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); + if (stock->cached_objcg) { + memcg = obj_cgroup_memcg(stock->cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3214,12 +3170,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); unsigned int nr_pages = 0; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->cached_objcg = objcg; stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) @@ -3233,7 +3193,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -3618,28 +3580,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, static int memcg_online_kmem(struct mem_cgroup *memcg) { 
struct obj_cgroup *objcg; - int memcg_id; if (cgroup_memory_nokmem) return 0; - BUG_ON(memcg->kmemcg_id >= 0); - - memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) - return memcg_id; + if (unlikely(mem_cgroup_is_root(memcg))) + return 0; objcg = obj_cgroup_alloc(); - if (!objcg) { - memcg_free_cache_id(memcg_id); + if (!objcg) return -ENOMEM; - } + objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); static_branch_enable(&memcg_kmem_enabled_key); - memcg->kmemcg_id = memcg_id; + memcg->kmemcg_id = memcg->id.id; return 0; } @@ -3647,9 +3604,11 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct mem_cgroup *parent; - int kmemcg_id; - if (memcg->kmemcg_id == -1) + if (cgroup_memory_nokmem) + return; + + if (unlikely(mem_cgroup_is_root(memcg))) return; parent = parent_mem_cgroup(memcg); @@ -3658,19 +3617,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_reparent_objcgs(memcg, parent); - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - /* * After we have finished memcg_reparent_objcgs(), all list_lrus * corresponding to this cgroup are guaranteed to remain empty. * The ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). + * memcg_reparent_list_lrus(). */ - memcg_drain_all_list_lrus(kmemcg_id, parent); - - memcg_free_cache_id(kmemcg_id); - memcg->kmemcg_id = -1; + memcg_reparent_list_lrus(memcg, parent); } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -3756,8 +3709,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } break; case RES_SOFT_LIMIT: - memcg->soft_limit = nr_pages; - ret = 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + ret = -EOPNOTSUPP; + } else { + memcg->soft_limit = nr_pages; + ret = 0; + } break; } return ret ?: nbytes; @@ -4733,6 +4690,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, char *endp; int ret; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + buf = strstrip(buf); efd = simple_strtoul(buf, &endp, 10); @@ -4850,6 +4810,17 @@ out_kfree: return ret; } +#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +static int mem_cgroup_slab_show(struct seq_file *m, void *p) +{ + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; +} +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4950,7 +4921,7 @@ static struct cftype mem_cgroup_legacy_files[] = { (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { @@ -5049,18 +5020,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - int tmp = node; - /* - * This routine is called against possible nodes. - * But it's BUG to call kmalloc() against offline node. - * - * TODO: this routine can waste much memory for nodes which will - * never be onlined. It's better to use memory hotplug callback - * function. 
- */ - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) return 1; @@ -5072,8 +5033,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); - pn->usage_in_excess = 0; - pn->on_tree = false; pn->memcg = memcg; memcg->nodeinfo[node] = pn; @@ -5110,21 +5069,16 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM; - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error); memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, - 1, MEM_CGROUP_ID_MAX, - GFP_KERNEL); + 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { error = memcg->id.id; goto fail; @@ -5178,7 +5132,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); struct mem_cgroup *memcg, *old_memcg; - long error = -ENOMEM; old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); @@ -5207,34 +5160,26 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - /* The following stuff does not apply to the root */ - error = memcg_online_kmem(memcg); - if (error) - goto fail; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); return &memcg->css; -fail: - mem_cgroup_id_remove(memcg); - mem_cgroup_free(memcg); - return ERR_PTR(error); } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + if (memcg_online_kmem(memcg)) + goto remove_id; + /* * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (alloc_shrinker_info(memcg)) { - mem_cgroup_id_remove(memcg); - return -ENOMEM; - } + if (alloc_shrinker_info(memcg)) + goto offline_kmem; /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); @@ -5244,6 +5189,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); return 0; +offline_kmem: + memcg_offline_kmem(memcg); +remove_id: + mem_cgroup_id_remove(memcg); + return -ENOMEM; } static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) @@ -5301,9 +5251,6 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - - /* Need to offline kmem if online_css() fails */ - memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } @@ -5489,17 +5436,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. 
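/*
 * Sketch of the struct_size() idiom adopted by mem_cgroup_alloc() above
 * (hypothetical structure): struct_size(t, info, nr) evaluates to sizeof(*t)
 * plus room for a trailing flexible array of nr elements, saturating on
 * overflow, and replaces the open-coded "sizeof(...) + nr * sizeof(elem)".
 */
#include <linux/overflow.h>
#include <linux/slab.h>

struct node_info_table {
	int nr;
	void *info[];			/* flexible array member */
};

static struct node_info_table *node_info_table_alloc(int nr)
{
	struct node_info_table *t;

	t = kzalloc(struct_size(t, info, nr), GFP_KERNEL);
	if (t)
		t->nr = nr;
	return t;
}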
*/ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } @@ -6312,6 +6254,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); } static int memory_events_show(struct seq_file *m, void *v) @@ -6785,8 +6729,8 @@ static void uncharge_batch(const struct uncharge_gather *ug) page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + if (ug->nr_kmem) + memcg_account_kmem(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } @@ -6805,7 +6749,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = folio_memcg_kmem(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -6814,7 +6757,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) * folio memcg or objcg at this point, we have fully * exclusive access to the folio. */ - if (use_objcg) { + if (folio_memcg_kmem(folio)) { objcg = __folio_objcg(folio); /* * This get matches the put at the end of the function and @@ -6842,7 +6785,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) nr_pages = folio_nr_pages(folio); - if (use_objcg) { + if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; @@ -6952,7 +6895,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; /* Do not associate the sock with unrelated interrupted task's memcg. */ - if (in_interrupt()) + if (!in_task()) return; rcu_read_lock(); @@ -7037,7 +6980,7 @@ static int __init cgroup_memory(char *s) if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; } - return 0; + return 1; } __setup("cgroup.memory=", cgroup_memory); @@ -7105,19 +7048,19 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) /** * mem_cgroup_swapout - transfer a memsw charge to swap - * @page: page whose memsw charge to transfer + * @folio: folio whose memsw charge to transfer * @entry: swap entry to move the charge to * - * Transfer the memsw charge of @page to @entry. + * Transfer the memsw charge of @folio to @entry. 
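/*
 * Sketch of the __setup() convention behind the "return 1" change to
 * cgroup_memory() above (option name and variable are hypothetical): a
 * handler returns 1 once it has consumed its parameter, while returning 0
 * makes the option be treated as unknown and handed on to init.
 */
#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

static bool example_nofeature __initdata;

static int __init example_setup(char *s)
{
	if (s && !strcmp(s, "off"))
		example_nofeature = true;
	return 1;		/* parameter handled here, do not pass it to init */
}
__setup("example.memory=", example_setup);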
*/ -void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; unsigned int nr_entries; unsigned short oldid; - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); if (mem_cgroup_disabled()) return; @@ -7125,9 +7068,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page_memcg(page); + memcg = folio_memcg(folio); - VM_WARN_ON_ONCE_PAGE(!memcg, page); + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); if (!memcg) return; @@ -7137,16 +7080,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - nr_entries = thp_nr_pages(page); + nr_entries = folio_nr_pages(folio); /* Get references for the tail pages, too */ if (nr_entries > 1) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), nr_entries); - VM_BUG_ON_PAGE(oldid, page); + VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->memcg_data = 0; + folio->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7163,9 +7106,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ - VM_BUG_ON(!irqs_disabled()); + memcg_stats_lock(); mem_cgroup_charge_statistics(memcg, -nr_entries); - memcg_check_events(memcg, page_to_nid(page)); + memcg_stats_unlock(); + memcg_check_events(memcg, folio_nid(folio)); css_put(&memcg->css); } diff --git a/mm/memfd.c b/mm/memfd.c index 9f80f162791a..08f5f8304746 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -31,20 +31,28 @@ static void memfd_tag_pins(struct xa_state *xas) { struct page *page; - unsigned int tagged = 0; + int latency = 0; + int cache_count; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, page, ULONG_MAX) { - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas->xa_index); - if (page_count(page) - page_mapcount(page) > 1) + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && + page_count(page) - total_mapcount(page) != cache_count) xas_set_mark(xas, MEMFD_TAG_PINNED); + if (cache_count != 1) + xas_set(xas, page->index + cache_count); - if (++tagged % XA_CHECK_SCHED) + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(xas); xas_unlock_irq(xas); @@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - unsigned int tagged = 0; + int latency = 0; + int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_lock_irq(&xas); xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas.xa_index); - if (page_count(page) - page_mapcount(page) != 1) { + + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if 
(!xa_is_value(page) && cache_count != + page_count(page) - total_mapcount(page)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still @@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct address_space *mapping) } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - if (++tagged % XA_CHECK_SCHED) + + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f1c389f7e669..dcb6bb9cf731 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include <linux/ratelimit.h> #include <linux/page-isolation.h> #include <linux/pagewalk.h> +#include <linux/shmem_fs.h> #include "internal.h" #include "ras/ras_event.h" @@ -129,12 +130,6 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - /* - * page_mapping() does not accept slab pages. - */ - if (PageSlab(p)) - return -EINVAL; - mapping = page_mapping(p); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -257,16 +252,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); - if (flags & MF_ACTION_REQUIRED) { - if (t == current) - ret = force_sig_mceerr(BUS_MCEERR_AR, - (void __user *)tk->addr, addr_lsb); - else - /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */ - ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, - addr_lsb, t); - } else { + if ((flags & MF_ACTION_REQUIRED) && (t == current)) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + else /* + * Signal other processes sharing the page if they have + * PF_MCE_EARLY set. * Don't use force here, it's convenient if the signal * can be temporarily blocked. * This could cause a loop when the user sets SIGBUS @@ -274,7 +266,6 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) */ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? */ - } if (ret < 0) pr_info("Memory failure: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); @@ -314,6 +305,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, pmd_t *pmd; pte_t *pte; + VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return 0; @@ -486,12 +478,13 @@ static struct task_struct *task_early_kill(struct task_struct *tsk, static void collect_procs_anon(struct page *page, struct list_head *to_kill, int force_early) { + struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma_read(page); + av = folio_lock_anon_vma_read(folio); if (av == NULL) /* Not actually mapped anymore */ return; @@ -706,8 +699,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? 
-EHWPOISON : -EFAULT; } static const char *action_name[] = { @@ -722,7 +717,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -737,9 +731,9 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", + [MF_MSG_DIFFERENT_PAGE_SIZE] = "different page size", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -867,6 +861,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -896,17 +891,23 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) } /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } @@ -1154,18 +1155,40 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function * does not return true for hugetlb or device memory pages, so it's assumed * to be called only in the context where we never have such pages. */ -static inline bool HWPoisonHandlable(struct page *page) +static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { - return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page); + bool movable = false; + + /* Soft offline could mirgate non-LRU movable pages */ + if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) + movable = true; + + return movable || PageLRU(page) || is_free_buddy_page(page); } -static int __get_hwpoison_page(struct page *page) +static int __get_hwpoison_page(struct page *page, unsigned long flags) { struct page *head = compound_head(page); int ret = 0; @@ -1180,7 +1203,7 @@ static int __get_hwpoison_page(struct page *page) * for any unsupported type of page in order to reduce the risk of * unexpected races caused by taking a page refcount. 
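/*
 * Sketch of the bounded-latency XArray walk used by memfd_tag_pins() and
 * memfd_wait_for_pins() in the mm/memfd.c hunks above (helper and batch size
 * here are hypothetical): instead of counting tagged entries, the walk now
 * accumulates a latency weight (a THP counts as HPAGE_PMD_NR) and uses
 * xas_pause() to drop the lock and reschedule once the batch is exceeded.
 */
#include <linux/limits.h>
#include <linux/sched.h>
#include <linux/xarray.h>

#define SCAN_BATCH	4096	/* plays the role of XA_CHECK_SCHED */

static void scan_mapping_entries(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);
	void *entry;
	int latency = 0;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, ULONG_MAX) {
		latency++;		/* one weight per entry; THPs would add more */
		if (latency < SCAN_BATCH)
			continue;
		latency = 0;
		xas_pause(&xas);	/* remember the position across the unlock */
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
}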
*/ - if (!HWPoisonHandlable(head)) + if (!HWPoisonHandlable(head, flags)) return -EBUSY; if (get_page_unless_zero(head)) { @@ -1205,7 +1228,7 @@ static int get_any_page(struct page *p, unsigned long flags) try_again: if (!count_increased) { - ret = __get_hwpoison_page(p); + ret = __get_hwpoison_page(p, flags); if (!ret) { if (page_count(p)) { /* We raced with an allocation, retry. */ @@ -1233,7 +1256,7 @@ try_again: } } - if (PageHuge(p) || HWPoisonHandlable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p, flags)) { ret = 1; } else { /* @@ -1256,6 +1279,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1263,7 +1307,7 @@ out: * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@ -1272,18 +1316,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1296,6 +1348,7 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page *hpage) { + struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); @@ -1360,26 +1413,22 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - if (!PageHuge(hpage)) { - try_to_unmap(hpage, ttu); + if (PageHuge(hpage) && !PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. 
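/*
 * Sketch only (hypothetical caller inside mm/memory-failure.c): a legend for
 * the return codes documented for get_hwpoison_page() above, including the
 * new -EHWPOISON case for a page already taken off the buddy list.
 */
#include <linux/mm.h>

static int grab_error_page(struct page *p, unsigned long mf_flags)
{
	int ret = get_hwpoison_page(p, mf_flags);

	switch (ret) {
	case 1:			/* refcount taken; page is in a handlable state */
		return 0;
	case 0:			/* page was free; no refcount taken */
		return 0;
	case -EHWPOISON:	/* hwpoisoned and isolated from the buddy lists */
	case -EBUSY:		/* raced with allocation/free; caller may retry */
	case -EIO:		/* unsupported page type */
	default:
		return ret;
	}
}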
+ */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); } else { - if (!PageAnon(hpage)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (mapping) { - try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(hpage, ttu); - } + try_to_unmap(folio, ttu); } unmap_success = !page_mapped(hpage); @@ -1475,7 +1524,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestClearPageHWPoison(head)) num_poisoned_pages_dec(); unlock_page(head); - return 0; + return -EOPNOTSUPP; } unlock_page(head); res = MF_FAILED; @@ -1492,14 +1541,25 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) } lock_page(head); + + /* + * The page could have changed compound pages due to race window. + * If this happens just bail out. + */ + if (!PageHuge(p) || compound_head(p) != head) { + action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED); + res = -EBUSY; + goto out; + } + page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(head)) + num_poisoned_pages_dec(); + put_page(p); + res = -EOPNOTSUPP; + goto out; } /* @@ -1553,6 +1613,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, } /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by * lock_page(), but dax pages do not use the page lock. This @@ -1564,7 +1630,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto out; if (hwpoison_filter(page)) { - rc = 0; + rc = -EOPNOTSUPP; goto unlock; } @@ -1589,7 +1655,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(page, &tokill, true); list_for_each_entry(tk, &tokill, nd) if (tk->size_shift) @@ -1604,7 +1670,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); + kill_procs(&tokill, true, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); @@ -1615,6 +1681,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1631,17 +1699,19 @@ out: * * Must run in process context (e.g. a work queue) with interrupts * enabled and no spinlocks hold. + * + * Return: 0 for successfully handled the memory error, + * -EOPNOTSUPP for memory_filter() filtered the error event, + * < 0(except -EOPNOTSUPP) on failure. 
*/ int memory_failure(unsigned long pfn, int flags) { struct page *p; struct page *hpage; - struct page *orig_head; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1683,7 +1753,7 @@ try_again: goto unlock_mutex; } - orig_head = hpage = compound_head(p); + hpage = compound_head(p); num_poisoned_pages_inc(); /* @@ -1764,10 +1834,21 @@ try_again: lock_page(p); /* - * The page could have changed compound pages during the locking. - * If this happens just bail out. + * We're only intended to deal with the non-Compound page here. + * However, the page could have changed compound pages due to + * race window. If this happens, we could try again to hopefully + * handle the page next round. */ - if (PageCompound(p) && compound_head(p) != orig_head) { + if (PageCompound(p)) { + if (retry) { + if (TestClearPageHWPoison(p)) + num_poisoned_pages_dec(); + unlock_page(p); + put_page(p); + flags &= ~MF_COUNT_INCREASED; + retry = false; + goto try_again; + } action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto unlock_page; @@ -1782,21 +1863,12 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_page(p); + res = -EOPNOTSUPP; goto unlock_mutex; } @@ -1805,7 +1877,7 @@ try_again: * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode. 
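/*
 * Sketch of a consumer of the return convention documented above (the wrapper
 * is hypothetical; real callers are e.g. the MCE handlers): -EOPNOTSUPP now
 * means "event filtered by hwpoison_filter()" rather than a handling failure.
 */
#include <linux/errno.h>
#include <linux/mm.h>

static int report_hwpoison_pfn(unsigned long pfn)
{
	int ret = memory_failure(pfn, 0);	/* MF_* flags omitted in this sketch */

	if (ret == -EOPNOTSUPP)
		return 0;	/* filtered by the injection filters: nothing to do */
	return ret;		/* 0 on success, other negatives are real failures */
}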
*/ - if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) + if (!PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* @@ -1955,6 +2027,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1971,8 +2065,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1982,69 +2075,60 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex; - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. 
- */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p); - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2087,7 +2171,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) */ static int __soft_offline_page(struct page *page) { - int ret = 0; + long ret = 0; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); char const *msg_page[] = {"page", "hugepage"}; @@ -2098,12 +2182,6 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - /* - * Check PageHWPoison again inside page lock because PageHWPoison - * is set by memory_failure() outside page lock. Note that - * memory_failure() also double-checks PageHWPoison inside page lock, - * so there's no race between soft_offline_page() and memory_failure(). - */ lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2114,7 +2192,7 @@ static int __soft_offline_page(struct page *page) return 0; } - if (!PageHuge(page)) + if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page)) /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. @@ -2122,10 +2200,6 @@ static int __soft_offline_page(struct page *page) ret = invalidate_inode_page(page); unlock_page(page); - /* - * RED-PEN would be better to keep it isolated here, but we - * would need to fix isolation locking first. 
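/*
 * Userspace sketch (assumes CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN): the
 * __soft_offline_page() path above, and its soft_offline_page() entry point
 * that follows below, is reachable via madvise(MADV_SOFT_OFFLINE), which is
 * how the hwpoison selftests exercise it.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* kernel UAPI value, in case libc lacks it */
#endif

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;					/* fault the page in */
	if (madvise(p, psize, MADV_SOFT_OFFLINE))	/* migrate, then offline the old page */
		perror("madvise(MADV_SOFT_OFFLINE)");
	munmap(p, psize);
	return 0;
}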
- */ if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); page_handle_poison(page, false, true); @@ -2144,7 +2218,7 @@ static int __soft_offline_page(struct page *page) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n", + pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", pfn, msg_page[huge], ret, &page->flags); if (ret > 0) ret = -EBUSY; @@ -2225,15 +2299,18 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } retry: get_online_mems(); - ret = get_hwpoison_page(page, flags); + ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE); put_online_mems(); if (ret > 0) { @@ -2246,5 +2323,7 @@ retry: } } + mutex_unlock(&mf_mutex); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..7c40850b7124 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include <linux/kernel_stat.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> @@ -719,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@ -734,8 +733,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(!PageAnon(page)); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); + set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However @@ -1304,6 +1302,40 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct folio *single_folio; /* Locked folio to be unmapped */ + bool even_cows; /* Zap COWed private pages too? */ +}; + +/* Whether we should zap all COWed (private) pages too */ +static inline bool should_zap_cows(struct zap_details *details) +{ + /* By default, zap all pages */ + if (!details) + return true; + + /* Or, we zap COWed pages only if the caller wants to */ + return details->even_cows; +} + +/* Decides whether we should zap this page with the page pointer specified */ +static inline bool should_zap_page(struct zap_details *details, struct page *page) +{ + /* If we can make a decision without *page.. */ + if (should_zap_cows(details)) + return true; + + /* E.g. 
the caller passes NULL for the case of a zero page */ + if (!page) + return true; + + /* Otherwise we should only zap non-anon pages */ + return !PageAnon(page); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1326,6 +1358,8 @@ again: arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; + struct page *page; + if (pte_none(ptent)) continue; @@ -1333,10 +1367,8 @@ again: break; if (pte_present(ptent)) { - struct page *page; - page = vm_normal_page(vma, addr, ptent); - if (unlikely(zap_skip_check_mapping(details, page))) + if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1354,7 +1386,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1368,34 +1400,32 @@ again: entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); - - if (unlikely(zap_skip_check_mapping(details, page))) + page = pfn_swap_entry_to_page(entry); + if (unlikely(!should_zap_page(details, page))) continue; - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); - continue; - } - - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) - continue; - - if (!non_swap_entry(entry)) + } else if (!non_swap_entry(entry)) { + /* Genuine swap entry, hence a private anon page */ + if (!should_zap_cows(details)) + continue; rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; - + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } else if (is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); + if (!should_zap_page(details, page)) + continue; rss[mm_counter(page)]--; + } else if (is_hwpoison_entry(entry)) { + if (!should_zap_cows(details)) + continue; + } else { + /* We should have covered all the swap entry types */ + WARN_ON_ONCE(1); } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1443,8 +1473,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ - } else if (details && details->single_page && - PageTransCompound(details->single_page) && + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* @@ -1682,7 +1712,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { - if (address < vma->vm_start || address + size > vma->vm_end || + if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; @@ -1730,16 +1760,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int 
insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1753,7 +1783,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1762,17 +1791,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1782,7 +1811,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1819,7 +1848,7 @@ more: start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3075,7 +3104,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3095,16 +3124,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3317,12 +3336,8 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - zba = first_index; - if (zba < vba) - zba = vba; - zea = last_index; - if (zea > vea) - zea = vea; + zba = max(first_index, vba); + zea = min(last_index, vea); unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, @@ -3332,31 +3347,30 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** - * unmap_mapping_page() - Unmap single page from processes. 
- * @page: The locked page to be unmapped. + * unmap_mapping_folio() - Unmap single folio from processes. + * @folio: The locked folio to be unmapped. * - * Unmap this page from any userspace process which still has it mmaped. + * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once - * truncation or invalidation holds the lock on a page, it may find that - * the page has been remapped again: and then uses unmap_mapping_page() + * truncation or invalidation holds the lock on a folio, it may find that + * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ -void unmap_mapping_page(struct page *page) +void unmap_mapping_folio(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageTail(page)); + VM_BUG_ON(!folio_test_locked(folio)); - first_index = page->index; - last_index = page->index + thp_nr_pages(page) - 1; + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1; - details.zap_mapping = mapping; - details.single_page = page; + details.even_cows = false; + details.single_folio = folio; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) @@ -3384,7 +3398,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.zap_mapping = even_cows ? NULL : mapping; + details.even_cows = even_cows; if (last_index < first_index) last_index = ULONG_MAX; @@ -3507,7 +3521,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3555,7 +3568,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3569,13 +3581,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; @@ -3626,7 +3636,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3639,8 +3649,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3651,6 +3659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } + 
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -3852,11 +3863,16 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; if (unlikely(PageHWPoison(vmf->page))) { - if (ret & VM_FAULT_LOCKED) + vm_fault_t poisonret = VM_FAULT_HWPOISON; + if (ret & VM_FAULT_LOCKED) { + /* Retry if a clean page was removed from the cache. */ + if (invalidate_inode_page(vmf->page)) + poisonret = 0; unlock_page(vmf->page); + } put_page(vmf->page); vmf->page = NULL; - return VM_FAULT_HWPOISON; + return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) @@ -3928,7 +3944,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3977,7 +3994,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } @@ -4603,6 +4620,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), @@ -5425,6 +5443,8 @@ long copy_huge_page_from_user(struct page *dst_page, if (rc) break; + flush_dcache_page(subpage); + cond_resched(); } return ret_val; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2a9627dc784c..416b38ca8def 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -295,12 +295,6 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -/* - * Reasonably generic function for adding memory. It is - * expected that archs that support memory hotplug will - * call this function after deciding the zone to which to - * add the new pages. - */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { @@ -829,7 +823,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn struct pglist_data *pgdat = NODE_DATA(nid); int zid; - for (zid = 0; zid <= ZONE_NORMAL; zid++) { + for (zid = 0; zid < ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid]; if (zone_intersects(zone, start_pfn, nr_pages)) @@ -1162,43 +1156,20 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid) +static pg_data_t __ref *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reintialization. + */ pgdat = NODE_DATA(nid); - if (!pgdat) { - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; - - pgdat->per_cpu_nodestats = - alloc_percpu(struct per_cpu_nodestat); - arch_refresh_nodedata(nid, pgdat); - } else { - int cpu; - /* - * Reset the nr_zones, order and highest_zoneidx before reuse. 
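/*
 * Sketch of the rule behind the flush_dcache_page() addition to
 * copy_huge_page_from_user() above (helper name is hypothetical, len is
 * assumed <= PAGE_SIZE): after the kernel writes into a page that userspace
 * may have mapped, aliasing D-cache architectures need flush_dcache_page()
 * for the data to be visible through the user mapping.
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void fill_user_visible_page(struct page *page, const void *src, size_t len)
{
	void *dst = kmap_local_page(page);

	memcpy(dst, src, len);
	kunmap_local(dst);
	flush_dcache_page(page);	/* keep the D-cache coherent with user mappings */
}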
- * Note that kswapd will init kswapd_highest_zoneidx properly - * when it starts in the near future. - */ - pgdat->nr_zones = 0; - pgdat->kswapd_order = 0; - pgdat->kswapd_highest_zoneidx = 0; - for_each_online_cpu(cpu) { - struct per_cpu_nodestat *p; - - p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); - memset(p, 0, sizeof(*p)); - } - } - - /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_core_hotplug(nid); + free_area_init_core_hotplug(pgdat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1210,6 +1181,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid) * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). + * TODO: should be in free_area_init_core_hotplug? */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); @@ -1217,16 +1189,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid) return pgdat; } -static void rollback_node_hotadd(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - - arch_refresh_nodedata(nid, NULL); - free_percpu(pgdat->per_cpu_nodestats); - arch_free_nodedata(pgdat); -} - - /* * __try_online_node - online a node if offlined * @nid: the node ID @@ -1246,7 +1208,7 @@ static int __try_online_node(int nid, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid); + pgdat = hotadd_init_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1327,7 +1289,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. */ return memmap_on_memory && - !hugetlb_free_vmemmap_enabled && + !hugetlb_free_vmemmap_enabled() && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && @@ -1421,9 +1383,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) BUG_ON(ret); } - /* link memory sections under this node.*/ - link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); + register_memory_blocks_under_node(nid, PFN_DOWN(start), + PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) @@ -1445,9 +1407,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: - /* rollback pgdat allocation and others */ - if (new_node) - rollback_node_hotadd(nid); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: @@ -1590,38 +1549,6 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * Confirm all pages in a range [start, end) belong to the same zone (skipping - * memory holes). When true, return the zone. 
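/*
 * Worked example for the mhp_supports_memmap_on_memory() checks above, using
 * typical x86-64 values (stated as assumptions, not taken from this patch):
 * a 128 MiB memory block covers 32768 pages of 4 KiB; with a 64-byte
 * struct page its memmap needs 32768 * 64 = 2 MiB, which is PMD_SIZE-aligned,
 * and the remaining 128 MiB - 2 MiB = 126 MiB is still a multiple of the
 * 2 MiB pageblock, so such a block qualifies for memmap_on_memory.
 */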
- */ -struct zone *test_pages_in_a_zone(unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long pfn, sec_end_pfn; - struct zone *zone = NULL; - struct page *page; - - for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); - pfn < end_pfn; - pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { - /* Make sure the memory section is present first */ - if (!present_section_nr(pfn_to_section_nr(pfn))) - continue; - for (; pfn < sec_end_pfn && pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - /* Check if we got outside of the zone */ - if (zone && !zone_spans_pfn(zone, pfn)) - return NULL; - page = pfn_to_page(pfn); - if (zone && page_zone(page) != zone) - return NULL; - zone = page_zone(page); - } - } - - return zone; -} - -/* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, * non-lru movable pages and hugepages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on @@ -1690,10 +1617,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) DEFAULT_RATELIMIT_BURST); for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct folio *folio; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - head = compound_head(page); + folio = page_folio(page); + head = &folio->page; if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; @@ -1710,10 +1640,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * the unmap as the catch all safety net). */ if (PageHWPoison(page)) { - if (WARN_ON(PageLRU(page))) - isolate_lru_page(page); - if (page_mapped(page)) - try_to_unmap(page, TTU_IGNORE_MLOCK); + if (WARN_ON(folio_test_lru(folio))) + folio_isolate_lru(folio); + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK); continue; } @@ -1844,15 +1774,15 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, } int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, - struct memory_group *group) + struct zone *zone, struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; + const int node = zone_to_nid(zone); unsigned long flags; - struct zone *zone; struct memory_notify arg; - int ret, node; char *reason; + int ret; /* * {on,off}lining is constrained to full memory sections (or more @@ -1884,15 +1814,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, goto failed_removal; } - /* This makes hotplug much easier...and readable. - we assume this for now. .*/ - zone = test_pages_in_a_zone(start_pfn, end_pfn); - if (!zone) { + /* + * We only support offlining of memory blocks managed by a single zone, + * checked by calling code. This is just a sanity check that we might + * want to remove in the future. 
+ */ + if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || + page_zone(pfn_to_page(end_pfn - 1)) != zone)) { ret = -EINVAL; reason = "multizone range"; goto failed_removal; } - node = zone_to_nid(zone); /* * Disable pcplists so that page isolation cannot race with freeing @@ -2004,6 +1936,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, return 0; failed_removal_isolated: + /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: @@ -2014,7 +1947,6 @@ failed_removal: (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - /* pushback to free area */ mem_hotplug_done(); return ret; } @@ -2046,12 +1978,12 @@ static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) return mem->nr_vmemmap_pages; } -static int check_cpu_on_node(pg_data_t *pgdat) +static int check_cpu_on_node(int nid) { int cpu; for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == pgdat->node_id) + if (cpu_to_node(cpu) == nid) /* * the cpu on this node isn't removed, and we can't * offline this node. @@ -2085,7 +2017,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) */ void try_offline_node(int nid) { - pg_data_t *pgdat = NODE_DATA(nid); int rc; /* @@ -2093,7 +2024,7 @@ void try_offline_node(int nid) * offline it. A node spans memory after move_pfn_range_to_zone(), * e.g., after the memory block was onlined. */ - if (pgdat->node_spanned_pages) + if (node_spanned_pages(nid)) return; /* @@ -2105,7 +2036,7 @@ void try_offline_node(int nid) if (rc) return; - if (check_cpu_on_node(pgdat)) + if (check_cpu_on_node(nid)) return; /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..a2516d31db6c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. 
+ * + * Return: this @node if it is online, otherwise the closest node by distance */ int numa_map_to_online_node(int node) { @@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -783,7 +786,6 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - struct vm_area_struct *next; struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; @@ -798,8 +800,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (start > vma->vm_start) prev = vma; - for (; vma && vma->vm_start < end; prev = vma, vma = next) { - next = vma->vm_next; + for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); @@ -810,13 +811,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); if (prev) { vma = prev; - next = vma->vm_next; - if (mpol_equal(vma_policy(vma), new_pol)) - continue; - /* vma_merge() joined vma && vma->next, case 8 */ goto replace; } if (vma->vm_start != vmstart) { @@ -903,17 +901,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; - int err; + int ret; - int locked = 1; - err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err > 0) { - err = page_to_nid(p); + ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); + if (ret > 0) { + ret = page_to_nid(p); put_page(p); } - if (locked) - mmap_read_unlock(mm); - return err; + return ret; } /* Retrieve NUMA policy */ @@ -964,14 +959,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* - * Take a refcount on the mpol, lookup_node() - * will drop the mmap_lock, so after calling - * lookup_node() only "pol" remains valid, "vma" - * is stale. + * Take a refcount on the mpol, because we are about to + * drop the mmap_lock, after which only "pol" remains + * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); + mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; @@ -1477,6 +1472,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. 
+ */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1801,6 +1867,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2061,7 +2132,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2072,7 +2143,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * @order: Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. - * @node: Which node to prefer for allocation (modulo policy). * @hugepage: For hugepages try only the preferred node if possible. * * Allocate a page for a specific address in @vma, using the appropriate @@ -2083,9 +2153,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * Return: The page on success or NULL if allocation fails. 
*/ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; + int node = numa_node_id(); struct page *page; int preferred_nid; nodemask_t *nmask; @@ -2102,6 +2173,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2185,7 +2257,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), @@ -2341,6 +2413,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; @@ -2884,7 +2958,7 @@ static const char * const policy_modes[] = * Format of input: * <mode>[=<flags>][:<nodelist>] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { diff --git a/mm/memremap.c b/mm/memremap.c index 5a66a71ab591..c17eca4a48ca 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -4,7 +4,7 @@ #include <linux/io.h> #include <linux/kasan.h> #include <linux/memory_hotplug.h> -#include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pfn_t.h> #include <linux/swap.h> #include <linux/mmzone.h> @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/xarray.h> +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,38 +101,10 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(unsigned long pfn) +static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { - if (pfn % 1024 == 0) - cond_resched(); - return pfn + 1; -} - -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) - -static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - 
else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } - /* - * Undo the pgmap ref assignment for the internal case as the - * caller may re-enable the same pgmap. - */ - if (pgmap->ref == &pgmap->internal_ref) - pgmap->ref = NULL; + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) @@ -164,14 +135,13 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; - dev_pagemap_kill(pgmap); + percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); @@ -188,8 +158,7 @@ static void devm_memremap_pages_release(void *data) static void dev_pagemap_percpu_release(struct percpu_ref *ref) { - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref); complete(&pgmap->done); } @@ -295,12 +264,12 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) - - pfn_first(pgmap, range_id)); + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: - kasan_remove_zero_shadow(__va(range->start), range_len(range)); + if (!is_private) + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); err_pfn_remap: @@ -346,8 +315,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } @@ -362,22 +330,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) break; } - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (error) + return ERR_PTR(error); devmap_managed_enable_get(pgmap); @@ -486,7 +443,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) pgmap = NULL; rcu_read_unlock(); @@ -494,21 +451,17 @@ struct dev_pagemap 
*get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) return; - } __ClearPageWaiters(page); mem_cgroup_uncharge(page_folio(page)); /* - * When a device_private page is freed, the page->mapping field + * When a device managed page is freed, the page->mapping field * may still contain a (stale) mapping value. For example, the * lower bits of page->mapping may still identify the page as an * anonymous page. Ultimately, this entire field is just stale @@ -530,5 +483,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. + */ + set_page_count(page, 1); +} + +#ifdef CONFIG_FS_DAX +bool __put_devmap_managed_page(struct page *page) +{ + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + + /* + * fsdax page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); + return true; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +EXPORT_SYMBOL(__put_devmap_managed_page); +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index cf25b00f03c8..4f30ed37856f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,18 +38,18 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> -#include <linux/pagewalk.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/balloon_compaction.h> -#include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> #include <linux/oom.h> #include <linux/memory.h> +#include <linux/random.h> +#include <linux/sched/sysctl.h> #include <asm/tlbflush.h> @@ -106,7 +106,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* Driver shouldn't use PG_isolated bit of page->flags */ WARN_ON_ONCE(PageIsolated(page)); - __SetPageIsolated(page); + SetPageIsolated(page); unlock_page(page); return 0; @@ -125,7 +125,7 @@ static void putback_movable_page(struct page *page) mapping = page_mapping(page); mapping->a_ops->putback_page(page); - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* @@ -158,7 +158,7 @@ void putback_movable_pages(struct list_head *l) if (PageMovable(page)) putback_movable_page(page); else - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); put_page(page); } else { @@ -172,37 +172,33 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, - unsigned long addr, void *old) +static bool remove_migration_pte(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *old) { - struct page_vma_mapped_walk pvmw = { - .page = old, - .vma = vma, - .address = addr, - .flags = PVMW_SYNC | PVMW_MIGRATION, - }; - struct page *new; - pte_t pte; - swp_entry_t entry; + DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); - VM_BUG_ON_PAGE(PageTail(page), 
page); while (page_vma_mapped_walk(&pvmw)) { - if (PageKsm(page)) - new = page; - else - new = page - pvmw.page->index + - linear_page_index(vma, pvmw.address); + pte_t pte; + swp_entry_t entry; + struct page *new; + unsigned long idx = 0; + + /* pgoff is invalid for ksm pages, but they are never large */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; + new = folio_page(folio, idx); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { - VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); remove_migration_pmd(&pvmw, new); continue; } #endif - get_page(new); + folio_get(folio); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_mksoft_dirty(pte); @@ -231,31 +227,27 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, } #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) { + if (folio_test_hugetlb(folio)) { unsigned int shift = huge_page_shift(hstate_vma(vma)); pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) + if (folio_test_anon(folio)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageAnon(new)) + if (folio_test_anon(folio)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -268,17 +260,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct page *old, struct page *new, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = old, + .arg = src, }; if (locked) - rmap_walk_locked(new, &rwc); + rmap_walk_locked(dst, &rwc); else - rmap_walk(new, &rwc); + rmap_walk(dst, &rwc); } /* @@ -291,7 +283,6 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, { pte_t pte; swp_entry_t entry; - struct page *page; spin_lock(ptl); pte = *ptep; @@ -302,18 +293,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - page = pfn_swap_entry_to_page(entry); - page = compound_head(page); - - /* - * Once page cache replacement of page migration started, page_count - * is zero; but we must not call put_and_wait_on_page_locked() without - * a ref. Use get_page_unless_zero(), and just fault again if it fails. 
- */ - if (!get_page_unless_zero(page)) - goto out; - pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(entry, ptep, ptl); return; out: pte_unmap_unlock(ptep, ptl); @@ -338,16 +318,11 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); - if (!get_page_unless_zero(page)) - goto unlock; - spin_unlock(ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); return; unlock: spin_unlock(ptl); @@ -358,14 +333,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } @@ -434,14 +403,6 @@ int folio_migrate_mapping(struct address_space *mapping, } xas_store(&xas, newfolio); - if (nr > 1) { - int i; - - for (i = 1; i < nr; i++) { - xas_next(&xas); - xas_store(&xas, newfolio); - } - } /* * Drop cache reference from old page by unfreezing @@ -796,6 +757,7 @@ int buffer_migrate_page_norefs(struct address_space *mapping, */ static int writeout(struct address_space *mapping, struct page *page) { + struct folio *folio = page_folio(page); struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, @@ -821,7 +783,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page, false); + remove_migration_ptes(folio, folio, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -908,7 +870,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageIsolated(page), page); if (!PageMovable(page)) { rc = MIGRATEPAGE_SUCCESS; - __ClearPageIsolated(page); + ClearPageIsolated(page); goto out; } @@ -930,7 +892,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * We clear PG_movable under page_lock so any compactor * cannot try to migrate this page. 
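The PageIsolated/PageMovable handling in this function is the kernel-facing half of the non-LRU movable page protocol; the driver-facing half is a set of address_space_operations hooks (->putback_page is visible in the putback path above). A rough, hypothetical sketch of a movable-page provider, assuming those hooks as they exist at this point in the tree:

/* Hypothetical driver; names and bodies are illustrative only. */
static bool my_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* Pin driver-private state; the page is locked by the caller. */
	return true;
}

static int my_migratepage(struct address_space *mapping,
			  struct page *newpage, struct page *page,
			  enum migrate_mode mode)
{
	/* Copy contents and driver state from @page to @newpage. */
	return MIGRATEPAGE_SUCCESS;
}

static void my_putback_page(struct page *page)
{
	/* Undo my_isolate_page() when the migration attempt is aborted. */
}

static const struct address_space_operations my_movable_aops = {
	.isolate_page	= my_isolate_page,
	.migratepage	= my_migratepage,
	.putback_page	= my_putback_page,
};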
*/ - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* @@ -942,8 +904,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, page->mapping = NULL; if (likely(!is_zone_device_page(newpage))) - flush_dcache_page(newpage); - + flush_dcache_folio(page_folio(newpage)); } out: return rc; @@ -952,6 +913,8 @@ out: static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { + struct folio *folio = page_folio(page); + struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; @@ -1055,16 +1018,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_migrate(page, 0); + try_to_migrate(folio, 0); page_was_mapped = true; } if (!page_mapped(page)) rc = move_to_new_page(newpage, page, mode); + /* + * When successful, push newpage to LRU immediately: so that if it + * turns out to be an mlocked page, remove_migration_ptes() will + * automatically build up the correct newpage->mlock_count for it. + * + * We would like to do something similar for the old page, when + * unsuccessful, and other cases when a page has been temporarily + * isolated from the unevictable LRU: but this case is the easiest. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + lru_cache_add(newpage); + if (page_was_mapped) + lru_add_drain(); + } + if (page_was_mapped) - remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); + remove_migration_ptes(folio, + rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); out_unlock_both: unlock_page(newpage); @@ -1075,98 +1053,16 @@ out_unlock: unlock_page(page); out: /* - * If migration is successful, decrease refcount of the newpage + * If migration is successful, decrease refcount of the newpage, * which will not free the page because new page owner increased - * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. Use the old state of the isolated source page to - * determine if we migrated a LRU page. newpage was already unlocked - * and possibly modified by its owner - don't rely on the page - * state. + * refcounter. */ - if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(!is_lru)) - put_page(newpage); - else - putback_lru_page(newpage); - } + if (rc == MIGRATEPAGE_SUCCESS) + put_page(newpage); return rc; } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate - */ - -/* - * Writes to this array occur without locking. 
Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ - int target; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); - - return target; -} - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -1191,7 +1087,7 @@ static int unmap_and_move(new_page_t get_new_page, if (unlikely(__PageMovable(page))) { lock_page(page); if (!PageMovable(page)) - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); } goto out; @@ -1272,6 +1168,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason, struct list_head *ret) { + struct folio *dst, *src = page_folio(hpage); int rc = -EAGAIN; int page_was_mapped = 0; struct page *new_hpage; @@ -1299,6 +1196,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; + dst = page_folio(new_hpage); if (!trylock_page(hpage)) { if (!force) @@ -1348,7 +1246,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, ttu |= TTU_RMAP_LOCKED; } - try_to_migrate(hpage, ttu); + try_to_migrate(src, ttu); page_was_mapped = 1; if (mapping_locked) @@ -1359,8 +1257,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = move_to_new_page(new_hpage, hpage, mode); if (page_was_mapped) - remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: unlock_page(new_hpage); @@ -1422,7 +1320,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@ -1430,7 +1328,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. 
The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1439,6 +1339,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@ -1447,16 +1348,15 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, bool is_thp = false; struct page *page; struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false; trace_mm_migrate_pages_start(mode, reason); - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - +thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@ -1469,7 +1369,7 @@ retry: * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched(); if (PageHuge(page)) @@ -1505,18 +1405,20 @@ retry: case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } /* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@ -1525,16 +1427,19 @@ retry: * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@ -1544,12 +1449,11 @@ retry: retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@ -1560,17 +1464,37 @@ retry: */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. 
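A minimal sketch of the caller contract documented above for migrate_pages(), under the assumption that the caller sits inside mm/ and can use the mm-internal alloc_migration_target() helper and struct migration_target_control from mm/internal.h (the helper function itself is hypothetical):

#include "internal.h"	/* struct migration_target_control, alloc_migration_target() */

static int move_list_to_node(struct list_head *pages, int nid)
{
	struct migration_target_control mtc = {
		.nid		= nid,
		.gfp_mask	= GFP_HIGHUSER_MOVABLE,
	};
	unsigned int nr_succeeded = 0;
	int rc;

	rc = migrate_pages(pages, alloc_migration_target, NULL,
			   (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL,
			   &nr_succeeded);
	if (rc) {
		/* rc counts unmigrated normal pages/THPs/hugetlb, or is -errno. */
		putback_movable_pages(pages);
	}
	pr_debug("migrated %u base pages to node %d\n", nr_succeeded, nid);
	return rc < 0 ? rc : 0;
}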
+ */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@ -1579,16 +1503,13 @@ out: list_splice(&ret_pages, from); count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - if (ret_succeeded) *ret_succeeded = nr_succeeded; @@ -1681,7 +1602,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; struct page *page; - unsigned int follflags; int err; mmap_read_lock(mm); @@ -1691,8 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - page = follow_page(vma, addr, follflags); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1831,6 +1750,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, } /* + * The move_pages() man page does not have an -EEXIST choice, so + * use -EFAULT instead. + */ + if (err == -EEXIST) + err = -EFAULT; + + /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. 
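Since the hunk above changes what lands in the user-visible status array, here is a hedged userspace view of the same contract through move_pages(2) (libnuma wrapper assumed; link with -lnuma; the helper is illustrative):

#include <numaif.h>
#include <stdio.h>

static void move_one_page(void *addr, int target_node)
{
	void *pages[1]  = { addr };
	int   nodes[1]  = { target_node };
	int   status[1] = { -1 };

	if (move_pages(0 /* current process */, 1, pages, nodes, status,
		       MPOL_MF_MOVE))
		perror("move_pages");
	else if (status[0] >= 0)
		printf("page is now on node %d\n", status[0]);
	else
		printf("page was not moved: status %d\n", status[0]);
}

On success each status entry holds the node the page ended up on; on failure it holds a negative errno-style value, which after this change never includes -EEXIST.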
*/ @@ -2103,16 +2029,27 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; int nr_pages = thp_nr_pages(page); + int order = compound_order(page); - VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, nr_pages)) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) { + int z; + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) + return 0; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + if (populated_zone(pgdat->node_zones + z)) + break; + } + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; + } if (isolate_lru_page(page)) return 0; @@ -2141,6 +2078,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; + unsigned int nr_succeeded; LIST_HEAD(migratepages); new_page_t *new; bool compound; @@ -2179,7 +2117,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, + &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -2188,8 +2127,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, putback_lru_page(page); } isolated = 0; - } else - count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages); + } + if (nr_succeeded) { + count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); + if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) + mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, + nr_succeeded); + } BUG_ON(!list_empty(&migratepages)); return isolated; @@ -2200,784 +2144,152 @@ out: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_skip(unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; - } - - return 0; -} - -static int migrate_vma_collect_hole(unsigned long start, - unsigned long end, - __always_unused int depth, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - /* Only allow populating anonymous memory. 
*/ - if (!vma_is_anonymous(walk->vma)) - return migrate_vma_collect_skip(start, end, walk); - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; - } - - return 0; -} - -static int migrate_vma_collect_pmd(pmd_t *pmdp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - struct vm_area_struct *vma = walk->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; - spinlock_t *ptl; - pte_t *ptep; - -again: - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, walk); - - if (pmd_trans_huge(*pmdp)) { - struct page *page; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { - spin_unlock(ptl); - goto again; - } - - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_skip(start, end, - walk); - } else { - int ret; - - get_page(page); - spin_unlock(ptl); - if (unlikely(!trylock_page(page))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, - walk); - } - } - - if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_skip(start, end, walk); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - arch_enter_lazy_mmu_mode(); - - for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn = 0, pfn; - struct page *page; - swp_entry_t entry; - pte_t pte; - - pte = *ptep; - - if (pte_none(pte)) { - if (vma_is_anonymous(vma)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - } - goto next; - } - - if (!pte_present(pte)) { - /* - * Only care about unaddressable device page special - * page table entry. Other special swap entries are not - * migratable, and we ignore regular swapped page. - */ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) - goto next; - - page = pfn_swap_entry_to_page(entry); - if (!(migrate->flags & - MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || - page->pgmap->owner != migrate->pgmap_owner) - goto next; - - mpfn = migrate_pfn(page_to_pfn(page)) | - MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) - mpfn |= MIGRATE_PFN_WRITE; - } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; - pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - goto next; - } - page = vm_normal_page(migrate->vma, addr, pte); - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; - } - - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = 0; - goto next; - } - - /* - * By getting a reference on the page we pin it and that blocks - * any kind of migration. Side effect is that it "freezes" the - * pte. - * - * We drop this reference after isolating the page from the lru - * for non device page (device page are not on the lru and thus - * can't be dropped from it). - */ - get_page(page); - - /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. 
- */ - if (trylock_page(page)) { - pte_t swp_pte; - - migrate->cpages++; - ptep_get_and_clear(mm, addr, ptep); - - /* Setup special migration page table entry */ - if (mpfn & MIGRATE_PFN_WRITE) - entry = make_writable_migration_entry( - page_to_pfn(page)); - else - entry = make_readable_migration_entry( - page_to_pfn(page)); - swp_pte = swp_entry_to_pte(entry); - if (pte_present(pte)) { - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } else { - if (pte_swp_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_swp_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - } - set_pte_at(mm, addr, ptep, swp_pte); - - /* - * This is like regular unmap: we remove the rmap and - * drop page refcount. Page won't be freed, as we took - * a reference just above. - */ - page_remove_rmap(page, false); - put_page(page); - - if (pte_present(pte)) - unmapped++; - } else { - put_page(page); - mpfn = 0; - } - -next: - migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = mpfn; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); - - /* Only flush the TLB if we actually modified any entries */ - if (unmapped) - flush_tlb_range(walk->vma, start, end); - - return 0; -} - -static const struct mm_walk_ops migrate_vma_walk_ops = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, -}; - -/* - * migrate_vma_collect() - collect pages over a range of virtual addresses - * @migrate: migrate struct containing all migration information - * - * This will walk the CPU page table. For each virtual address backed by a - * valid page, it updates the src array and takes a reference on the page, in - * order to pin the page until we lock it and unmap it. - */ -static void migrate_vma_collect(struct migrate_vma *migrate) -{ - struct mmu_notifier_range range; - - /* - * Note that the pgmap_owner is passed to the mmu notifier callback so - * that the registered device driver can skip invalidating device - * private page mappings that won't be migrated. - */ - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - - walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, - &migrate_vma_walk_ops, migrate); - - mmu_notifier_invalidate_range_end(&range); - migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); -} - -/* - * migrate_vma_check_page() - check if page is pinned or not - * @page: struct page to check - * - * Pinned pages cannot be migrated. This is the same test as in - * folio_migrate_mapping(), except that here we allow migration of a - * ZONE_DEVICE page. - */ -static bool migrate_vma_check_page(struct page *page) -{ - /* - * One extra ref because caller holds an extra reference, either from - * isolate_lru_page() for a regular page, or migrate_vma_collect() for - * a device page. - */ - int extra = 1; - - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). - */ - if (PageCompound(page)) - return false; - - /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) { - /* - * Private page can never be pin as they have no valid pte and - * GUP will fail for those. 
Yet if there is a pending migration - * a thread might try to wait on the pte migration entry and - * will bump the page reference count. Sadly there is no way to - * differentiate a regular pin from migration wait. Hence to - * avoid 2 racing thread trying to migrate back to CPU to enter - * infinite loop (one stopping migration because the other is - * waiting on pte migration entry). We always return true here. - * - * FIXME proper solution is to rework migration_entry_wait() so - * it does not need to take a reference on page. - */ - return is_device_private_page(page); - } - - /* For file back page */ - if (page_mapping(page)) - extra += 1 + page_has_private(page); - - if ((page_count(page) - extra) > page_mapcount(page)) - return false; - - return true; -} - /* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. - */ -static void migrate_vma_unmap(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; - bool allow_drain = true; - - lru_add_drain(); - - for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) - continue; - - /* ZONE_DEVICE pages are not on LRU */ - if (!is_zone_device_page(page)) { - if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ - lru_add_drain_all(); - allow_drain = false; - } - - if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - - /* Drop the reference we took in collect */ - put_page(page); - } - - if (page_mapped(page)) - try_to_migrate(page, 0); - - if (page_mapped(page) || !migrate_vma_check_page(page)) { - if (!is_zone_device_page(page)) { - get_page(page); - putback_lru_page(page); - } - - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - continue; - } - } - - for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - - remove_migration_ptes(page, page, false); - - migrate->src[i] = 0; - unlock_page(page); - put_page(page); - restore--; - } -} - -/** - * migrate_vma_setup() - prepare to migrate a range of memory - * @args: contains the vma, start, and pfns arrays for the migration - * - * Returns: negative errno on failures, 0 when 0 or more pages were migrated - * without an error. + * node_demotion[] example: * - * Prepare to migrate a range of memory virtual address range by collecting all - * the pages backing each virtual address in the range, saving them inside the - * src array. Then lock those pages and unmap them. Once the pages are locked - * and unmapped, check whether each page is pinned or not. Pages that aren't - * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the - * corresponding src array entry. Then restores any pages that are pinned, by - * remapping and unlocking those pages. + * Consider a system with two sockets. 
Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: * - * The caller should then allocate destination memory and copy source memory to - * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE - * flag set). Once these are allocated and copied, the caller must update each - * corresponding entry in the dst array with the pfn value of the destination - * page and with MIGRATE_PFN_VALID. Destination pages must be locked via - * lock_page(). + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 * - * Note that the caller does not have to migrate all the pages that are marked - * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from - * device memory to system memory. If the caller cannot migrate a device page - * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe - * consequences for the userspace process, so it must be avoided if at all - * possible. + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: * - * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing the caller to allocate device memory for those unbacked virtual - * addresses. For this the caller simply has to allocate device memory and - * properly set the destination entry like for regular migration. Note that - * this can still fail, and thus inside the device driver you must check if the - * migration was successful for those entries after calling migrate_vma_pages(), - * just like for regular migration. + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop * - * After that, the callers must call migrate_vma_pages() to go over each entry - * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then migrate_vma_pages() to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the - * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the - * src array. + * This is represented in the node_demotion[] like this: * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate * - * Once migrate_vma_pages() returns the caller may inspect which pages were - * successfully migrated, and which were not. Successfully migrated pages will - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * Moreover some systems may have multiple slow memory nodes. 
+ * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: * - * It is safe to update device page table after migrate_vma_pages() because - * both destination and source page are still locked, and the mmap_lock is held - * in read mode (hence no one can unmap the range being migrated). + * 0 -> 1/2 -> stop * - * Once the caller is done cleaning up things and updating its page table (if it - * chose to do so, this is not an obligation) it finally calls - * migrate_vma_finalize() to update the CPU page table to point to new pages - * for successfully migrated pages or otherwise restore the CPU page table to - * point to the original source pages. + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate */ -int migrate_vma_setup(struct migrate_vma *args) -{ - long nr_pages = (args->end - args->start) >> PAGE_SHIFT; - - args->start &= PAGE_MASK; - args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || - args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; - if (!args->src || !args->dst) - return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); - - if (args->cpages) - migrate_vma_unmap(args); - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the drivers. - */ - return 0; - -} -EXPORT_SYMBOL(migrate_vma_setup); /* - * This code closely matches the code in: - * __handle_mm_fault() - * handle_pte_fault() - * do_anonymous_page() - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. */ -static void migrate_vma_insert_page(struct migrate_vma *migrate, - unsigned long addr, - struct page *page, - unsigned long *src) -{ - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - bool flush = false; - spinlock_t *ptl; - pte_t entry; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - /* Only allow populating anonymous memory */ - if (!vma_is_anonymous(vma)) - goto abort; - - pgdp = pgd_offset(mm, addr); - p4dp = p4d_alloc(mm, pgdp, addr); - if (!p4dp) - goto abort; - pudp = pud_alloc(mm, p4dp, addr); - if (!pudp) - goto abort; - pmdp = pmd_alloc(mm, pudp, addr); - if (!pmdp) - goto abort; - - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) - goto abort; - - /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. 
- * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). - */ - if (pte_alloc(mm, pmdp)) - goto abort; - - /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(pmdp))) - goto abort; - - if (unlikely(anon_vma_prepare(vma))) - goto abort; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) - goto abort; - - /* - * The memory barrier inside __SetPageUptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. - */ - __SetPageUptodate(page); - - if (is_zone_device_page(page)) { - if (is_device_private_page(page)) { - swp_entry_t swp_entry; - - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable - * device memory. - */ - pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); - goto abort; - } - } else { - entry = mk_pte(page, vma->vm_page_prot); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - } +#define DEFAULT_DEMOTION_TARGET_NODES 15 - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - - if (check_stable_address_space(mm)) - goto unlock_abort; - - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); - - if (!is_zero_pfn(pfn)) - goto unlock_abort; - flush = true; - } else if (!pte_none(*ptep)) - goto unlock_abort; - - /* - * Check for userfaultfd but do not deliver the fault. Instead, - * just back off. - */ - if (userfaultfd_missing(vma)) - goto unlock_abort; - - inc_mm_counter(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr, false); - if (!is_zone_device_page(page)) - lru_cache_add_inactive_or_unevictable(page, vma); - get_page(page); - - if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif - pte_unmap_unlock(ptep, ptl); - *src = MIGRATE_PFN_MIGRATE; - return; +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; -unlock_abort: - pte_unmap_unlock(ptep, ptl); -abort: - *src &= ~MIGRATE_PFN_MIGRATE; -} +static struct demotion_nodes *node_demotion __read_mostly; /** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. 
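A rough sketch of how a reclaim-path caller is expected to consume the interface documented above (hypothetical helper; the in-tree user lives in vmscan):

static bool demotion_target_available(int nid)
{
	int target = next_demotion_node(nid);

	/* Terminal node, or demotion targets not (yet) established. */
	if (target == NUMA_NO_NODE)
		return false;

	/*
	 * The result is only a hint: the target may go offline or be
	 * recomputed after this check, so the migration itself must
	 * tolerate failure.
	 */
	return true;
}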
*/ -void migrate_vma_pages(struct migrate_vma *migrate) +int next_demotion_node(int node) { - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - struct mmu_notifier_range range; - unsigned long addr, i; - bool notified = false; - - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - struct address_space *mapping; - int r; - - if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - - if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; - if (!notified) { - notified = true; - - mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, - migrate->vma->vm_mm, addr, migrate->end, - migrate->pgmap_owner); - mmu_notifier_invalidate_range_start(&range); - } - migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); - continue; - } - - mapping = page_mapping(page); + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; - if (is_zone_device_page(newpage)) { - if (is_device_private_page(newpage)) { - /* - * For now only support private anonymous when - * migrating to un-addressable device memory. - */ - if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } else { - /* - * Other types of ZONE_DEVICE page are not - * supported. - */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - continue; - } - } + if (!node_demotion) + return NUMA_NO_NODE; - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); - if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - } + nd = &node_demotion[node]; /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() - * did already call it. + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. */ - if (notified) - mmu_notifier_invalidate_range_only_end(&range); -} -EXPORT_SYMBOL(migrate_vma_pages); - -/** - * migrate_vma_finalize() - restore CPU page table entry - * @migrate: migrate struct containing all migration information - * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. 
- */ -void migrate_vma_finalize(struct migrate_vma *migrate) -{ - const unsigned long npages = migrate->npages; - unsigned long i; - - for (i = 0; i < npages; i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); - - if (!page) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - continue; - } - - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); - } - newpage = page; - } + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); - remove_migration_ptes(page, newpage, false); - unlock_page(page); + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } - if (is_zone_device_page(page)) - put_page(page); - else - putback_lru_page(page); + target = READ_ONCE(nd->nodes[index]); - if (newpage != page) { - unlock_page(newpage); - if (is_zone_device_page(newpage)) - put_page(newpage); - else - putback_lru_page(newpage); - } - } +out: + rcu_read_unlock(); + return target; } -EXPORT_SYMBOL(migrate_vma_finalize); -#endif /* CONFIG_DEVICE_PRIVATE */ #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; - for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + if (!node_demotion) + return; + + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } } static void disable_all_migrate_targets(void) @@ -3004,28 +2316,45 @@ static void disable_all_migrate_targets(void) * Failing here is OK. It might just indicate * being at the end of a chain. */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd; - /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE; + nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE; - node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. 
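
A caller never sees the whole demotion chain directly; it only asks next_demotion_node() for the next hop, and each call may return a different one of several equally good targets. As a rough, purely illustrative sketch (the dump_demotion_path() helper below is hypothetical and not part of this patch), following one possible path from a node looks like this:

static void dump_demotion_path(int node)
{
	int hop = node;

	/* Each call returns one randomly chosen target, or NUMA_NO_NODE. */
	while ((hop = next_demotion_node(hop)) != NUMA_NO_NODE) {
		pr_info("node %d demotes to node %d\n", node, hop);
		node = hop;
	}
}
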
+ */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + goto out_clear; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + goto out_clear; + + nd->nodes[index] = migration_target; + nd->nr++; return migration_target; +out_clear: + node_clear(migration_target, *used); + return NUMA_NO_NODE; } /* @@ -3039,7 +2368,9 @@ static int establish_migrate_target(int node, nodemask_t *used) * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@ -3051,7 +2382,7 @@ static void __set_migration_target_nodes(void) nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance; /* * Avoid any oddities like cycles that could occur @@ -3080,18 +2411,33 @@ again: * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets); - if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1; /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@ -3105,7 +2451,7 @@ again: /* * For callers that do not hold get_online_mems() already. */ -static void set_migration_target_nodes(void) +void set_migration_target_nodes(void) { get_online_mems(); __set_migration_target_nodes(); @@ -3169,46 +2515,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, return notifier_from_errno(0); } -/* - * React to hotplug events that might affect the migration targets - * like events that online or offline NUMA nodes. - * - * The ordering is also currently dependent on which nodes have - * CPUs. That means we need CPU on/offline notification too. 
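
As a concrete, hypothetical illustration of the multiple-target case: suppose DRAM node 0 reports node_distance() 30 to PMEM nodes 2 and 3 but 40 to PMEM node 4. The first establish_migrate_target() call for node 0 records, say, node 2 and remembers best_distance = 30; the next call finds node 3 at the same distance and appends it, so node_demotion[0] ends up as {2, 3}. The call after that finds node 4 at distance 40 > 30, clears it from the used mask again and ends the loop, leaving node 4 free to become a demotion target in a later pass.
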
- */ -static int migration_online_cpu(unsigned int cpu) +void __init migrate_on_reclaim_init(void) { - set_migration_target_nodes(); - return 0; -} + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); -static int migration_offline_cpu(unsigned int cpu) -{ - set_migration_target_nodes(); - return 0; -} - -static int __init migrate_on_reclaim_init(void) -{ - int ret; - - ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", - NULL, migration_offline_cpu); + hotplug_memory_notifier(migrate_on_reclaim_callback, 100); /* - * In the unlikely case that this fails, the automatic - * migration targets may become suboptimal for nodes - * where N_CPU changes. With such a small impact in a - * rare case, do not bother trying to do anything special. + * At this point, all numa nodes with memory/CPus have their state + * properly set, so we can build the demotion order now. + * Let us hold the cpu_hotplug lock just, as we could possibily have + * CPU hotplug events during boot. */ - WARN_ON(ret < 0); - ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online", - migration_online_cpu, NULL); - WARN_ON(ret < 0); - - hotplug_memory_notifier(migrate_on_reclaim_callback, 100); - return 0; + cpus_read_lock(); + set_migration_target_nodes(); + cpus_read_unlock(); } -late_initcall(migrate_on_reclaim_init); #endif /* CONFIG_HOTPLUG_CPU */ bool numa_demotion_enabled = false; diff --git a/mm/migrate_device.c b/mm/migrate_device.c new file mode 100644 index 000000000000..70c7dc05bbfc --- /dev/null +++ b/mm/migrate_device.c @@ -0,0 +1,773 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device Memory Migration functionality. + * + * Originally written by Jérôme Glisse. + */ +#include <linux/export.h> +#include <linux/memremap.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> +#include <linux/oom.h> +#include <linux/pagewalk.h> +#include <linux/rmap.h> +#include <linux/swapops.h> +#include <asm/tlbflush.h> +#include "internal.h" + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. 
*/ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn = 0, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + + if (pte_none(pte)) { + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } + goto next; + } + + if (!pte_present(pte)) { + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = pfn_swap_entry_to_page(entry); + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; + if (is_writable_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + goto next; + } + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. 
+ */ + if (trylock_page(page)) { + pte_t swp_pte; + + migrate->cpages++; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, vma, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } else { + put_page(page); + mpfn = 0; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mmu_notifier_range range; + + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * folio_migrate_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). 
+ */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) + extra++; + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; + + if (!page) + continue; + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + folio = page_folio(page); + if (folio_mapped(folio)) + try_to_migrate(folio, 0); + + if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + continue; + } + } + + for (i = 0; i < npages && restore; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct folio *folio; + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + folio = page_folio(page); + remove_migration_ptes(folio, folio, false); + + migrate->src[i] = 0; + folio_unlock(folio); + folio_put(folio); + restore--; + } +} + +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via + * lock_page(). 
+ * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. 
+ */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when + * parallel threads are excluded by other means. + * + * Here we only have mmap_read_lock(mm). + */ + if (pte_alloc(mm, pmdp)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable device + * memory. + */ + if (is_zone_device_page(page)) { + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; + } + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) + goto unlock_abort; + flush = true; + } else if (!pte_none(*ptep)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. 
+ */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + if (!is_zone_device_page(page)) + lru_cache_add_inactive_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +unlock_abort: + pte_unmap_unlock(ptep, ptl); +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/** + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct mmu_notifier_range range; + unsigned long addr, i; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + if (!notified) { + notified = true; + + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when migrating + * to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ + if (notified) + mmu_notifier_invalidate_range_only_end(&range); +} +EXPORT_SYMBOL(migrate_vma_pages); + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. 
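
To make the setup/pages/finalize sequence described above concrete, a driver migrating system memory into its own device-private memory would look roughly like the sketch below. my_alloc_device_page() and my_copy_to_device() are hypothetical stand-ins for allocating a free device-private page from the driver's pgmap and for the driver's copy/DMA helper; error handling is kept minimal.

static int my_migrate_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, void *drvdata)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long *src_pfns, *dst_pfns;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.pgmap_owner	= drvdata,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
	if (!src_pfns || !dst_pfns) {
		ret = -ENOMEM;
		goto out;
	}
	args.src = src_pfns;
	args.dst = dst_pfns;

	ret = migrate_vma_setup(&args);
	if (ret)
		goto out;

	for (i = 0; i < args.npages; i++) {
		struct page *spage, *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = my_alloc_device_page();			/* hypothetical */
		if (!dpage)
			continue;	/* skip this page, don't fail the call */

		/* Destination pages must be locked, as documented above. */
		lock_page(dpage);
		spage = migrate_pfn_to_page(args.src[i]);
		if (spage)
			my_copy_to_device(dpage, spage);	/* hypothetical */
		/* migrate_pfn() also sets MIGRATE_PFN_VALID */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_vma_pages(&args);
	/* device page tables may be updated here, while both pages stay locked */
	migrate_vma_finalize(&args);
out:
	kfree(dst_pfns);
	kfree(src_pfns);
	return ret;
}
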
+ */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct folio *dst, *src; + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + src = page_folio(page); + dst = page_folio(newpage); + remove_migration_ptes(src, dst, false); + folio_unlock(src); + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} +EXPORT_SYMBOL(migrate_vma_finalize); diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..efd2dd2943de 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -14,6 +14,7 @@ #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/pagevec.h> +#include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> @@ -27,6 +28,8 @@ #include "internal.h" +static DEFINE_PER_CPU(struct pagevec, mlock_pvec); + bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) @@ -46,441 +49,320 @@ EXPORT_SYMBOL(can_do_mlock); * be placed on the LRU "unevictable" list, rather than the [in]active lists. * The unevictable list is an LRU sibling list to the [in]active lists. * PageUnevictable is set to indicate the unevictable state. - * - * When lazy mlocking via vmscan, it is important to ensure that the - * vma's VM_LOCKED status is not concurrently being modified, otherwise we - * may have mlocked a page that is being munlocked. So lazy mlock must take - * the mmap_lock for read, and verify that the vma really is locked - * (see mm/rmap.c). */ -/* - * LRU accounting for clear_page_mlock() - */ -void clear_page_mlock(struct page *page) +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) { - int nr_pages; + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; - if (!TestClearPageMlocked(page)) - return; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); - /* - * The previous TestClearPageMlocked() corresponds to the smp_mb() - * in __pagevec_lru_add_fn(). - * - * See __pagevec_lru_add_fn for more explanation. - */ - if (!isolate_lru_page(page)) { - putback_lru_page(page); - } else { + if (unlikely(page_evictable(page))) { /* - * We lost the race. the page already moved to evictable list. + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. 
*/ - if (PageUnevictable(page)) - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; } + + if (PageUnevictable(page)) { + if (PageMlocked(page)) + page->mlock_count++; + goto out; + } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + SetPageLRU(page); + return lruvec; } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. - */ -void mlock_vma_page(struct page *page) +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - if (!TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - if (!isolate_lru_page(page)) - putback_lru_page(page); - } + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; } -/* - * Finish munlock after successful page isolation - * - * Page must be locked. This is a wrapper for page_mlock() - * and putback_lru_page() with munlock accounting. - */ -static void __munlock_isolated_page(struct page *page) +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) { - /* - * Optimization: if the page was mapped just once, that's our mapping - * and we don't need to check all the other vmas. - */ - if (page_mapcount(page) > 1) - page_mlock(page); + int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; - /* Did try_to_unlock() succeed or punt? 
*/ - if (!PageMlocked(page)) - count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - putback_lru_page(page); + if (PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + +munlock: + if (TestClearPageMlocked(page)) { + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (isolated || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } +out: + if (isolated) + SetPageLRU(page); + return lruvec; } /* - * Accounting for page isolation fail during munlock - * - * Performs accounting when page isolation fails in munlock. There is nothing - * else to do because it means some other task has already removed the page - * from the LRU. putback_lru_page() will take care of removing the page from - * the unevictable list, if necessary. vmscan [page_referenced()] will move - * the page back to the unevictable list if some other vma has it mlocked. + * Flags held in the low bits of a struct page pointer on the mlock_pvec. */ -static void __munlock_isolation_failed(struct page *page) +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +static inline struct page *mlock_lru(struct page *page) { - int nr_pages = thp_nr_pages(page); + return (struct page *)((unsigned long)page + LRU_PAGE); +} - if (PageUnevictable(page)) - __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - else - __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); } -/** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head - * - * returns the size of the page as a page mask (0 for normal page, - * HPAGE_PMD_NR - 1 for THP head page) - * - * called from munlock()/munmap() path with page supposedly on the LRU. - * When we munlock a page, because the vma where we found the page is being - * munlock()ed or munmap()ed, we want to check whether other vmas hold the - * page locked so that we can leave it on the unevictable lru list and not - * bother vmscan with it. However, to walk the page's rmap list in - * page_mlock() we must isolate the page from the LRU. If some other - * task has removed the page from the LRU, we won't be able to do that. - * So we clear the PageMlocked as we might not get another chance. If we - * can't isolate the page, we leave it for putback_lru_page() and vmscan - * [page_referenced()/try_to_unmap()] to deal with. +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). 
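
Since struct page is at least word-aligned, pointers to it always have their two low bits clear, which is what makes the LRU_PAGE/NEW_PAGE tagging above safe. A minimal decode helper, shown only for illustration (mlock_pagevec() below open-codes the same thing), might look like:

static inline struct page *mlock_untag(struct page *page, unsigned long *mlock)
{
	*mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
	return (struct page *)((unsigned long)page - *mlock);
}
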
*/ -unsigned int munlock_vma_page(struct page *page) +static void mlock_pagevec(struct pagevec *pvec) { - int nr_pages; - - /* For page_mlock() and to serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; - if (!TestClearPageMlocked(page)) { - /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ - return 0; + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); } - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; - return nr_pages - 1; + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); } -/* - * convert get_user_pages() return value to posix mlock() error - */ -static int __mlock_posix_error_return(long retval) +bool need_mlock_page_drain(int cpu) { - if (retval == -EFAULT) - retval = -ENOMEM; - else if (retval == -ENOMEM) - retval = -EAGAIN; - return retval; + return pagevec_count(&per_cpu(mlock_pvec, cpu)); } -/* - * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() - * - * The fast path is available only for evictable pages with single mapping. - * Then we can bypass the per-cpu pvec and get better performance. - * when mapcount > 1 we need page_mlock() which can fail. - * when !page_evictable(), we need the full redo logic of putback_lru_page to - * avoid leaving evictable page in unevictable list. - * - * In case of success, @page is added to @pvec and @pgrescued is incremented - * in case that the page was previously unevictable. @page is also unlocked. +/** + * mlock_folio - mlock a folio already on (or temporarily off) LRU + * @folio: folio to be mlocked. */ -static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, - int *pgrescued) +void mlock_folio(struct folio *folio) { - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + struct pagevec *pvec = &get_cpu_var(mlock_pvec); - if (page_mapcount(page) <= 1 && page_evictable(page)) { - pagevec_add(pvec, page); - if (TestClearPageUnevictable(page)) - (*pgrescued)++; - unlock_page(page); - return true; + if (!folio_test_set_mlocked(folio)) { + int nr_pages = folio_nr_pages(folio); + + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } - return false; + folio_get(folio); + if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Putback multiple evictable pages to the LRU - * - * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of - * the pages might have meanwhile become unevictable but that is OK. +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. 
*/ -static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +void mlock_new_page(struct page *page) { - count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); - /* - *__pagevec_lru_add() calls release_pages() so we don't call - * put_page() explicitly - */ - __pagevec_lru_add(pvec); - count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Munlock a batch of pages from the same zone - * - * The work is split to two main phases. First phase clears the Mlocked flag - * and attempts to isolate the pages, all under a single zone lru lock. - * The second phase finishes the munlock only for pages where isolation - * succeeded. - * - * Note that the pagevec may be modified during the process. +/** + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. */ -static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +void munlock_page(struct page *page) { - int i; - int nr = pagevec_count(pvec); - int delta_munlocked = -nr; - struct pagevec pvec_putback; - struct lruvec *lruvec = NULL; - int pgrescued = 0; - - pagevec_init(&pvec_putback); - - /* Phase 1: page isolation */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - - if (TestClearPageMlocked(page)) { - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. - */ - if (TestClearPageLRU(page)) { - lruvec = folio_lruvec_relock_irq(folio, lruvec); - del_page_from_lru_list(page, lruvec); - continue; - } else - __munlock_isolation_failed(page); - } else { - delta_munlocked++; - } - - /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release() would deadlock. - */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; - } - if (lruvec) { - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - unlock_page_lruvec_irq(lruvec); - } else if (delta_munlocked) { - mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - } - - /* Now we can release pins of pages that we are not munlocking */ - pagevec_release(&pvec_putback); - - /* Phase 2: page munlock */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) { - lock_page(page); - if (!__putback_lru_fast_prepare(page, &pvec_putback, - &pgrescued)) { - /* - * Slow path. We don't want to lose the last - * pin before unlock_page() - */ - get_page(page); /* for putback_lru_page() */ - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* from follow_page_mask() */ - } - } - } + struct pagevec *pvec = &get_cpu_var(mlock_pvec); /* - * Phase 3: page putback for pages that qualified for the fast path - * This will also call put_page() to return pin from follow_page_mask() + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. 
*/ - if (pagevec_count(&pvec_putback)) - __putback_lru_fast(&pvec_putback, pgrescued); + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Fill up pagevec for __munlock_pagevec using pte walk - * - * The function expects that the struct page corresponding to @start address is - * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. - * - * The rest of @pvec is filled by subsequent pages within the same pmd and same - * zone, as long as the pte's are present and vm_normal_page() succeeds. These - * pages also get pinned. - * - * Returns the address of the next page that should be scanned. This equals - * @start + PAGE_SIZE when no page could be added by the pte walk. - */ -static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, struct zone *zone, - unsigned long start, unsigned long end) +static int mlock_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + { - pte_t *pte; + struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; + pte_t *start_pte, *pte; + struct page *page; - /* - * Initialize pte walk starting at the already pinned page where we - * are sure that there is a pte, as it was pinned under the same - * mmap_lock write op. - */ - pte = get_locked_pte(vma->vm_mm, start, &ptl); - /* Make sure we do not cross the page table boundary */ - end = pgd_addr_end(start, end); - end = p4d_addr_end(start, end); - end = pud_addr_end(start, end); - end = pmd_addr_end(start, end); - - /* The page next to the pinned page is the first we will try to get */ - start += PAGE_SIZE; - while (start < end) { - struct page *page = NULL; - pte++; - if (pte_present(*pte)) - page = vm_normal_page(vma, start, *pte); - /* - * Break if page could not be obtained or the page's node+zone does not - * match - */ - if (!page || page_zone(page) != zone) - break; + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (!pmd_present(*pmd)) + goto out; + if (is_huge_zero_pmd(*pmd)) + goto out; + page = pmd_page(*pmd); + if (vma->vm_flags & VM_LOCKED) + mlock_folio(page_folio(page)); + else + munlock_page(page); + goto out; + } - /* - * Do not use pagevec for PTE-mapped THP, - * munlock_vma_pages_range() will handle them. - */ + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; if (PageTransCompound(page)) - break; - - get_page(page); - /* - * Increase the address that will be returned *before* the - * eventual break due to pvec becoming full by adding the page - */ - start += PAGE_SIZE; - if (pagevec_add(pvec, page) == 0) - break; + continue; + if (vma->vm_flags & VM_LOCKED) + mlock_folio(page_folio(page)); + else + munlock_page(page); } - pte_unmap_unlock(pte, ptl); - return start; + pte_unmap(start_pte); +out: + spin_unlock(ptl); + cond_resched(); + return 0; } /* - * munlock_vma_pages_range() - munlock all pages in the vma range.' - * @vma - vma containing range to be munlock()ed. + * mlock_vma_pages_range() - mlock any pages already in the range, + * or munlock all pages in the range. + * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range - * @end - end of range in @vma. + * @end - end of range in @vma + * @newflags - the new set of flags for @vma. 
* - * For mremap(), munmap() and exit(). - * - * Called with @vma VM_LOCKED. - * - * Returns with VM_LOCKED cleared. Callers must be prepared to - * deal with this. - * - * We don't save and restore VM_LOCKED here because pages are - * still on lru. In unmap path, pages might be scanned by reclaim - * and re-mlocked by page_mlock/try_to_unmap before we unmap and - * free them. This will result in freeing mlocked pages. + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t newflags) { - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + static const struct mm_walk_ops mlock_walk_ops = { + .pmd_entry = mlock_pte_range, + }; - while (start < end) { - struct page *page; - unsigned int page_mask = 0; - unsigned long page_increm; - struct pagevec pvec; - struct zone *zone; + /* + * There is a slight chance that concurrent page migration, + * or page reclaim finding a page of this now-VM_LOCKED vma, + * will call mlock_vma_page() and raise page's mlock_count: + * double counting, leaving the page unevictable indefinitely. + * Communicate this danger to mlock_vma_page() with VM_IO, + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. + * mmap_lock is held in write mode here, so this weird + * combination should not be visible to other mmap_lock users; + * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. + */ + if (newflags & VM_LOCKED) + newflags |= VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); - pagevec_init(&pvec); - /* - * Although FOLL_DUMP is intended for get_dump_page(), - * it just so happens that its special treatment of the - * ZERO_PAGE (returning an error instead of doing get_page) - * suits munlock very well (and if somehow an abnormal page - * has sneaked into the range, we won't oops here: great). - */ - page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); - - if (page && !IS_ERR(page)) { - if (PageTransTail(page)) { - VM_BUG_ON_PAGE(PageMlocked(page), page); - put_page(page); /* follow_page_mask() */ - } else if (PageTransHuge(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may - * have gotten split before reaching - * munlock_vma_page(), so we need to compute - * the page_mask here instead. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); /* follow_page_mask() */ - } else { - /* - * Non-huge pages are handled in batches via - * pagevec. The pin from follow_page_mask() - * prevents them from collapsing by THP. - */ - pagevec_add(&pvec, page); - zone = page_zone(page); - - /* - * Try to fill the rest of pagevec using fast - * pte walk. This will also update start to - * the next page to process. Then munlock the - * pagevec. 
- */ - start = __munlock_pagevec_fill(&pvec, vma, - zone, start, end); - __munlock_pagevec(&pvec, zone); - goto next; - } - } - page_increm = 1 + page_mask; - start += page_increm * PAGE_SIZE; -next: - cond_resched(); + lru_add_drain(); + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); + lru_add_drain(); + + if (newflags & VM_IO) { + newflags &= ~VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); } } @@ -500,10 +382,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); - vm_flags_t old_flags = vma->vm_flags; + vm_flags_t oldflags = vma->vm_flags; - if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ @@ -512,7 +393,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; @@ -535,9 +416,9 @@ success: * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!lock) + if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; - else if (old_flags & VM_LOCKED) + else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; @@ -547,11 +428,12 @@ success: * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if (lock) + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + /* No work to do, and mlocking twice would be wrong */ vma->vm_flags = newflags; - else - munlock_vma_pages_range(vma, start, end); - + } else { + mlock_vma_pages_range(vma, start, end, newflags); + } out: *prev = vma; return ret; @@ -645,6 +527,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, return count >> PAGE_SHIFT; } +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; @@ -839,6 +733,7 @@ int user_shm_lock(size_t size, struct ucounts *ucounts) } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + allowed = 0; goto out; } allowed = 1; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..3aa839f81e63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> @@ -1029,7 +1030,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if 
(!anon_vma_name_eq(anon_vma_name(vma), anon_name)) + return 0; return 1; } @@ -1079,9 +1083,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1105,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? 
*/ @@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1609,8 +1616,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, @@ -1754,7 +1759,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1808,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -2550,7 +2555,7 @@ static int __init cmdline_parse_stack_guard_gap(char *p) if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; - return 0; + return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); @@ -2667,6 +2672,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = NULL; do { vma_rb_erase(vma, &mm->mm_rb); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2771,22 +2778,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline void -unlock_range(struct vm_area_struct *start, unsigned long limit) -{ - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - - while (tmp && tmp->vm_start < limit) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } -} - /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. 
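The anon_name argument threaded through is_mergeable_vma(), can_vma_merge_before/after() and vma_merge() above exists so that adjacent anonymous VMAs are only merged when their user-supplied names match. A minimal userspace sketch of the feature that drives this plumbing, assuming a 5.17+ kernel built with CONFIG_ANON_VMA_NAME=y; the fallback constants mirror the uapi values and are defined here only in case <sys/prctl.h> lacks them:

/*
 * Sketch only: names an anonymous mapping so it appears as
 * "[anon:demo_buffer]" in /proc/<pid>/maps.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 4 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char cmd[64];

	if (p == MAP_FAILED)
		return 1;

	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "demo_buffer"))
		perror("prctl(PR_SET_VMA_ANON_NAME)");

	snprintf(cmd, sizeof(cmd), "grep demo_buffer /proc/%d/maps", getpid());
	return system(cmd);
}

Two neighbouring mappings given different names this way stay as separate VMAs, which is exactly the case the new anon_vma_name_eq() check rejects.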
@@ -2867,12 +2858,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return error; } - /* - * unlock any mlock()ed ranges before detaching vmas - */ - if (mm->locked_vm) - unlock_range(vma, end); - /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) downgrade = false; @@ -2928,7 +2913,6 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); - profile_munmap(addr); return __vm_munmap(addr, len, true); } @@ -3056,7 +3040,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3141,26 +3125,20 @@ void exit_mmap(struct mm_struct *mm) * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. - * - * This needs to be done before calling munlock_vma_pages_all(), - * which clears VM_LOCKED, otherwise the oom reaper cannot - * reliably test it. */ (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } - if (mm->locked_vm) - unlock_range(mm->mmap, ULONG_MAX); - + mmap_write_lock(mm); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3171,16 +3149,15 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. 
*/ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mm->mmap = NULL; + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3249,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma @@ -3441,6 +3418,7 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_end = addr + len; vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1b9837419bf9..afb7185ffdc4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> +#include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e2..0ae7571e35ab 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); + /* + * The "Unevictable LRU" is imaginary: though its size is maintained, + * it is never scanned, and unevictable pages are not threaded on it + * (so that their lru fields can be reused to hold mlock_count). + * Poison its list head, so that any operations on it would crash. + */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) @@ -89,13 +96,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) unsigned long old_flags, flags; int last_cpupid; + old_flags = READ_ONCE(page->flags); do { - old_flags = flags = page->flags; - last_cpupid = page_cpupid_last(page); + flags = old_flags; + last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); return last_cpupid; } diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..b69ce7a7b2b7 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,7 @@ #include <linux/uaccess.h> #include <linux/mm_inline.h> #include <linux/pgtable.h> +#include <linux/sched/sysctl.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -83,6 +84,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa) { struct page *page; + int nid; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -94,7 +96,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - page_mapcount(page) != 1) + page_count(page) != 1) continue; /* @@ -109,7 +111,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. 
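The page_cpupid_xchg_last() rewrite above moves the read of page->flags out of the loop and lets try_cmpxchg() refresh the expected value only when the exchange fails. A standalone C11 model of that retry pattern, with a made-up field layout standing in for the real LAST_CPUPID bits:

#include <stdatomic.h>
#include <stdio.h>

#define LAST_CPUPID_SHIFT 8
#define LAST_CPUPID_MASK  0xffffUL

static unsigned long cpupid_xchg_last(_Atomic unsigned long *flags, int cpupid)
{
	unsigned long old = atomic_load_explicit(flags, memory_order_relaxed);
	unsigned long new, last;

	do {
		last = (old >> LAST_CPUPID_SHIFT) & LAST_CPUPID_MASK;
		new = old & ~(LAST_CPUPID_MASK << LAST_CPUPID_SHIFT);
		new |= ((unsigned long)cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_SHIFT;
		/* on failure, "old" is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return last;
}

int main(void)
{
	_Atomic unsigned long flags = 0x1234UL << LAST_CPUPID_SHIFT;

	printf("previous cpupid: %#lx\n", cpupid_xchg_last(&flags, 0x42));
	printf("flags now:       %#lx\n", (unsigned long)flags);
	return 0;
}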
*/ - if (target_node == page_to_nid(page)) + nid = page_to_nid(page); + if (target_node == nid) + continue; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) continue; } @@ -464,7 +475,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); diff --git a/mm/mremap.c b/mm/mremap.c index 002eec83e91e..9d76da79594d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -942,8 +942,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (mmap_write_lock_killable(current->mm)) return -EINTR; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) { + vma = vma_lookup(mm, addr); + if (!vma) { ret = EFAULT; goto out; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ddabefcfb5a..7ec38194f8e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -93,9 +93,6 @@ static bool oom_cpuset_eligible(struct task_struct *start, bool ret = false; const nodemask_t *mask = oc->nodemask; - if (is_memcg_oom(oc)) - return true; - rcu_read_lock(); for_each_thread(start, tsk) { if (mask) { @@ -526,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_lru_vma(vma)) + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; /* @@ -793,7 +790,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) * coredump_task_exit(), so the oom killer cannot assume that * the process will promptly exit and release memory. */ - if (sig->flags & SIGNAL_GROUP_COREDUMP) + if (sig->core_state) return false; if (sig->flags & SIGNAL_GROUP_EXIT) @@ -994,6 +991,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { + memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, (void *)message); @@ -1057,7 +1055,7 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) + if (freed > 0 && !is_sysrq_oom(oc)) /* Got some memory back in the last second. 
*/ return true; } @@ -1169,15 +1167,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1188,13 +1186,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a613f8ef6a02..435c02630593 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -324,18 +324,6 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) } /* - * Unreclaimable memory (kernel memory or anonymous memory - * without swap) can bring down the dirtyable pages below - * the zone's dirty balance reserve and the above calculation - * will underflow. However we still want to add in nodes - * which are below threshold (negative values) to get a more - * accurate calculation but make sure that the total never - * underflows. - */ - if ((long)x < 0) - x = 0; - - /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure @@ -2430,13 +2418,13 @@ EXPORT_SYMBOL(folio_write_one); /* * For address_spaces which do not use buffers nor write back. */ -int __set_page_dirty_no_writeback(struct page *page) +bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) - return !TestSetPageDirty(page); - return 0; + if (!folio_test_dirty(folio)) + return !folio_test_set_dirty(folio); + return false; } -EXPORT_SYMBOL(__set_page_dirty_no_writeback); +EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. @@ -2496,7 +2484,11 @@ void folio_account_cleaned(struct folio *folio, struct address_space *mapping, * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). + * The caller must hold lock_page_memcg(). Most callers have the folio + * locked. A few have the folio blocked from truncation through other + * means (eg zap_page_range() has it mapped and is holding the page table + * lock). This can also be called from mark_buffer_dirty(), which I + * cannot prove is always protected against truncate. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) @@ -2526,7 +2518,7 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, * This is also sometimes used by filesystems which use buffer_heads when * a single buffer is being dirtied: we want to set the folio dirty in * that case, but not all the buffers. This is a "bottom-up" dirtying, - * whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * whereas block_dirty_folio() is a "top-down" dirtying. 
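The process_mrelease() rework above pins the victim's mm with mmgrab()/mmdrop() and re-checks MMF_OOM_SKIP under mmap_read_lock before reaping. From userspace the call is only meaningful against a task that is already being killed; a sketch of that flow, where the x86_64 syscall numbers are assumptions used only if libc does not define them:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease	448
#endif

int main(void)
{
	pid_t pid = fork();
	int pidfd;

	if (pid == 0) {			/* expendable child */
		pause();
		_exit(0);
	}

	pidfd = (int)syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	kill(pid, SIGKILL);
	/*
	 * Only useful against a dying task: EINVAL means the task was not
	 * exiting; an already-reaped mm simply returns success.
	 */
	if (syscall(__NR_process_mrelease, pidfd, 0))
		perror("process_mrelease");
	else
		puts("victim address space reaped");

	waitpid(pid, NULL, 0);
	return 0;
}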
* * The caller must ensure this doesn't race with truncation. Most will * simply hold the folio lock, but e.g. zap_pte_range() calls with the @@ -2612,7 +2604,7 @@ EXPORT_SYMBOL(folio_redirty_for_writepage); * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * - * For folios with a mapping this should be done under the page lock + * For folios with a mapping this should be done with the folio lock held * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. @@ -2626,23 +2618,21 @@ bool folio_mark_dirty(struct folio *folio) if (likely(mapping)) { /* * readahead/lru_deactivate_page could remain - * PG_readahead/PG_reclaim due to race with end_page_writeback - * About readahead, if the page is written, the flags would be + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the page is redirty, the flag - * will be reset. So no problem. but if the page is used by readahead - * it will confuse readahead and make it restart the size rampup - * process. But it's a trivial problem. + * About lru_deactivate_page, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); - return mapping->a_ops->set_page_dirty(&folio->page); + return mapping->a_ops->dirty_folio(mapping, folio); } - if (!folio_test_dirty(folio)) { - if (!folio_test_set_dirty(folio)) - return true; - } - return false; + + return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..6e0b4596cde9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/jiffies.h> @@ -63,6 +64,7 @@ #include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> +#include <linux/page_table_check.h> #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> @@ -72,6 +74,7 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> +#include <linux/delayacct.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -125,7 +128,7 @@ static DEFINE_MUTEX(pcp_batch_high_lock); struct pagesets { local_lock_t lock; }; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { +static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -726,23 +729,32 @@ void free_compound_page(struct page *page) free_the_page(page, compound_order(page)); } +static void prep_compound_head(struct page *page, unsigned int order) +{ + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); + atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(compound_pincount_ptr(page), 0); +} + +static void prep_compound_tail(struct page *head, int tail_idx) +{ + struct page *p = head + tail_idx; + + p->mapping = TAIL_MAPPING; + set_compound_head(p, head); +} + void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; __SetPageHead(page); - for 
(i = 1; i < nr_pages; i++) { - struct page *p = page + i; - p->mapping = TAIL_MAPPING; - set_compound_head(p, page); - } + for (i = 1; i < nr_pages; i++) + prep_compound_tail(page, i); - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + prep_compound_head(page, order); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1059,14 +1071,12 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); + unsigned int max_order = pageblock_order; unsigned long buddy_pfn; unsigned long combined_pfn; - unsigned int max_order; struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); - VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -1104,25 +1114,24 @@ continue_merging: } if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on isolate - * pageblock and normal pageblock. Without this, pageblock - * isolation could cause incorrect freepage or CMA accounting. + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. * * We don't want to hit this code for the more frequent * low-order merging. */ - if (unlikely(has_isolate_pageblock(zone))) { - int buddy_mt; + int buddy_mt; - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - buddy_mt = get_pageblock_migratetype(buddy); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + buddy_mt = get_pageblock_migratetype(buddy); - if (migratetype != buddy_mt - && (is_migrate_isolate(migratetype) || - is_migrate_isolate(buddy_mt))) - goto done_merging; - } + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; max_order = order + 1; goto continue_merging; } @@ -1297,6 +1306,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1336,6 +1346,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1417,120 +1428,83 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ -static inline void prefetch_buddy(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); - struct page *buddy = page + (buddy_pfn - pfn); - - prefetch(buddy); -} - /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. * count is the number of pages to free. 
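__free_one_page() above merges a freed block with its buddy, and the buddy at a given order differs from the block in exactly one pfn bit; that is also the arithmetic the removed prefetch_buddy() relied on. A toy model of the pfn math, with an arbitrary example pfn:

#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234c0;	/* aligned well past order 3 */

	for (unsigned int order = 0; order < 4; order++) {
		unsigned long buddy = find_buddy_pfn(pfn, order);
		unsigned long combined = pfn & buddy;

		printf("order %u: pfn %#lx buddy %#lx -> merged block at %#lx\n",
		       order, pfn, buddy, combined);
		pfn = combined;		/* pretend the merge succeeded */
	}
	return 0;
}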
*/ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct per_cpu_pages *pcp, + int pindex) { - int pindex = 0; - int batch_free = 0; - int nr_freed = 0; + int min_pindex = 0; + int max_pindex = NR_PCP_LISTS - 1; unsigned int order; - int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; - struct page *page, *tmp; - LIST_HEAD(head); + struct page *page; /* * Ensure proper count is passed which otherwise would stuck in the * below while (list_empty(list)) loop. */ count = min(pcp->count, count); + + /* Ensure requested pindex is drained first. */ + pindex = pindex - 1; + + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + while (count > 0) { struct list_head *list; + int nr_pages; - /* - * Remove pages from lists in a round-robin fashion. A - * batch_free count is maintained that is incremented when an - * empty list is encountered. This is so more pages are freed - * off fuller lists instead of spinning excessively around empty - * lists - */ + /* Remove pages from lists in a round-robin fashion. */ do { - batch_free++; - if (++pindex == NR_PCP_LISTS) - pindex = 0; + if (++pindex > max_pindex) + pindex = min_pindex; list = &pcp->lists[pindex]; - } while (list_empty(list)); + if (!list_empty(list)) + break; - /* This is the only non-empty list. Free them all. */ - if (batch_free == NR_PCP_LISTS) - batch_free = count; + if (pindex == max_pindex) + max_pindex--; + if (pindex == min_pindex) + min_pindex++; + } while (1); order = pindex_to_order(pindex); + nr_pages = 1 << order; BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { + int mt; + page = list_last_entry(list, struct page, lru); + mt = get_pcppage_migratetype(page); + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - nr_freed += 1 << order; - count -= 1 << order; + count -= nr_pages; + pcp->count -= nr_pages; if (bulkfree_pcp_prepare(page)) continue; - /* Encode order with the migratetype */ - page->index <<= NR_PCP_ORDER_WIDTH; - page->index |= order; + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); - list_add_tail(&page->lru, &head); - - /* - * We are going to put the page back to the global - * pool, prefetch its buddy to speed up later access - * under zone->lock. It is believed the overhead of - * an additional test and calculating buddy_pfn here - * can be offset by reduced memory latency later. To - * avoid excessive prefetching due to large count, only - * prefetch buddy for the first pcp->batch nr of pages. - */ - if (prefetch_nr) { - prefetch_buddy(page); - prefetch_nr--; - } - } while (count > 0 && --batch_free && !list_empty(list)); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); } - pcp->count -= nr_freed; - - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); - - /* - * Use safe version since after __free_one_page(), - * page->lru.next will not point to original list. 
- */ - list_for_each_entry_safe(page, tmp, &head, lru) { - int mt = get_pcppage_migratetype(page); - /* mt has been encoded with the order (see above) */ - order = mt & NR_PCP_ORDER_MASK; - mt >>= NR_PCP_ORDER_WIDTH; - - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, order, mt); - } spin_unlock(&zone->lock); } @@ -2245,19 +2219,8 @@ void __init init_cma_reserved_pageblock(struct page *page) } while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA); - - if (pageblock_order >= MAX_ORDER) { - i = pageblock_nr_pages; - p = page; - do { - set_page_refcounted(p); - __free_pages(p, MAX_ORDER - 1); - p += MAX_ORDER_NR_PAGES; - } while (i -= MAX_ORDER_NR_PAGES); - } else { - set_page_refcounted(page); - __free_pages(page, pageblock_order); - } + set_page_refcounted(page); + __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; @@ -2327,23 +2290,36 @@ static inline int check_new_page(struct page *page) return 1; } +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + + if (unlikely(check_new_page(p))) + return true; + } + + return false; +} + #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when * being allocated from pcp lists. With debug_pagealloc also enabled, they are * also checked when pcp lists are refilled from the free lists. */ -static inline bool check_pcp_refill(struct page *page) +static inline bool check_pcp_refill(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return check_new_page(page); + return check_new_pages(page, order); else return false; } -static inline bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page, unsigned int order) { - return check_new_page(page); + return check_new_pages(page, order); } #else /* @@ -2351,32 +2327,19 @@ static inline bool check_new_pcp(struct page *page) * when pcp lists are being refilled from the free lists. With debug_pagealloc * enabled, they are also checked when being allocated from the pcp lists. 
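The rewritten free_pcppages_bulk() above drops the old batch_free heuristic: it now walks the pcp lists round-robin, narrowing the [min_pindex, max_pindex] window as lists are found empty, and drains each non-empty list directly under zone->lock. A self-contained model of that scan (as in the kernel, count must not exceed the total number of queued pages):

#include <stdio.h>

#define NR_LISTS 5

int main(void)
{
	int lists[NR_LISTS] = { 3, 0, 7, 1, 0 };	/* pages per pcp list */
	int min_idx = 0, max_idx = NR_LISTS - 1;
	int count = 9;			/* pages we were asked to free */
	int idx = 2 - 1;		/* drain the requested pindex first */

	while (count > 0) {
		/* find the next non-empty list, shrinking the window */
		do {
			if (++idx > max_idx)
				idx = min_idx;
			if (lists[idx])
				break;
			if (idx == max_idx)
				max_idx--;
			if (idx == min_idx)
				min_idx++;
		} while (1);

		/* drain it, like the inner do/while under zone->lock */
		while (count > 0 && lists[idx]) {
			lists[idx]--;
			count--;
		}
		printf("list %d drained to %d pages, window [%d,%d]\n",
		       idx, lists[idx], min_idx, max_idx);
	}
	return 0;
}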
*/ -static inline bool check_pcp_refill(struct page *page) +static inline bool check_pcp_refill(struct page *page, unsigned int order) { - return check_new_page(page); + return check_new_pages(page, order); } -static inline bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return check_new_page(page); + return check_new_pages(page, order); else return false; } #endif /* CONFIG_DEBUG_VM */ -static bool check_new_pages(struct page *page, unsigned int order) -{ - int i; - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - - if (unlikely(check_new_page(p))) - return true; - } - - return false; -} - inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -2410,6 +2373,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2463,17 +2427,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted + * + * The other migratetypes do not have fallbacks. */ static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, -#ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ -#endif }; #ifdef CONFIG_CMA @@ -2779,8 +2739,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) - && !is_migrate_cma(mt)) { + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); @@ -3021,7 +2981,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; - if (unlikely(check_pcp_refill(page))) + if (unlikely(check_pcp_refill(page, order))) continue; /* @@ -3070,7 +3030,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) - free_pcppages_bulk(zone, to_drain, pcp); + free_pcppages_bulk(zone, to_drain, pcp, 0); local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -3091,7 +3051,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp); + free_pcppages_bulk(zone, pcp->count, pcp, 0); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3314,10 +3274,15 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, + bool free_high) { int min_nr_free, max_nr_free; + /* Free everything if batch freeing high-order pages. 
*/ + if (unlikely(free_high)) + return pcp->count; + /* Check for PCP disabled or boot pageset */ if (unlikely(high < batch)) return 1; @@ -3338,11 +3303,12 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } -static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, + bool free_high) { int high = READ_ONCE(pcp->high); - if (unlikely(!high)) + if (unlikely(!high || free_high)) return 0; if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) @@ -3355,24 +3321,34 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype, unsigned int order) +static void free_unref_page_commit(struct page *page, int migratetype, + unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int high; int pindex; + bool free_high; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); list_add(&page->lru, &pcp->lists[pindex]); pcp->count += 1 << order; - high = nr_pcp_high(pcp, zone); + + /* + * As high-order pages other than THP's stored on PCP can contribute + * to fragmentation, limit the number stored when PCP is heavily + * freeing without allocation. The remainder after bulk freeing + * stops will be drained from vmstat refresh context. + */ + free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); + + high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); } } @@ -3405,7 +3381,7 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype, order); + free_unref_page_commit(page, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3415,13 +3391,13 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; - unsigned long flags, pfn; + unsigned long flags; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - pfn = page_to_pfn(page); + unsigned long pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); continue; @@ -3437,15 +3413,10 @@ void free_unref_page_list(struct list_head *list) free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; } - - set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - pfn = page_private(page); - set_page_private(page, 0); - /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. 
@@ -3455,7 +3426,7 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype, 0); + free_unref_page_commit(page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get @@ -3529,8 +3500,11 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && !is_migrate_highatomic(mt)) + /* + * Only change normal pageblocks (i.e., they can merge + * with others) + */ + if (migratetype_is_mergeable(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3625,7 +3599,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count -= 1 << order; - } while (check_new_pcp(page)); + } while (check_new_pcp(page, order)); return page; } @@ -3690,10 +3664,10 @@ struct page *rmqueue(struct zone *preferred_zone, * allocate greater than order-1 page units with __GFP_NOFAIL. */ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); do { page = NULL; + spin_lock_irqsave(&zone->lock, flags); /* * order-0 request can reach here when the pcplist is skipped * due to non-CMA allocation context. HIGHATOMIC area is @@ -3705,15 +3679,15 @@ struct page *rmqueue(struct zone *preferred_zone, if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); } - if (!page) + if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); - } while (page && check_new_pages(page, order)); - if (!page) - goto failed; - - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - spin_unlock_irqrestore(&zone->lock, flags); + if (!page) + goto failed; + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -4204,7 +4178,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); @@ -4348,6 +4324,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; psi_memstall_enter(&pflags); + delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, @@ -4355,6 +4332,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + delayacct_compact_end(); if (*compact_result == COMPACT_SKIPPED) return NULL; @@ -4575,13 +4553,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { unsigned int noreclaim_flag; - unsigned long pflags, progress; + unsigned long progress; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); @@ -4590,7 +4567,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); - psi_memstall_leave(&pflags); cond_resched(); @@ -4604,11 +4580,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long *did_some_progress) { struct page *page = NULL; + unsigned long pflags; bool drained = false; + psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) - return NULL; + goto out; retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); @@ -4624,6 +4602,8 @@ retry: drained = true; goto retry; } +out: + psi_memstall_leave(&pflags); return page; } @@ -6360,7 +6340,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { @@ -6382,7 +6362,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); @@ -6562,6 +6546,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone } #ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. 
+ */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } +} + +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6570,6 +6623,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -6587,42 +6641,16 @@ void __ref memmap_init_zone_device(struct zone *zone, nr_pages = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone_idx, nid); - - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer - * and zone_device_data. It is a bug if a ZONE_DEVICE page is - * ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->zone_device_data = NULL; + if (pfns_per_compound == 1) + continue; - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. 
- * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + pfns_per_compound); } pr_info("%s initialised %lu pages in %ums\n", __func__, @@ -7325,16 +7353,15 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order; + unsigned int order = MAX_ORDER - 1; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; - if (HPAGE_SHIFT > PAGE_SHIFT) + /* Don't let pageblocks exceed the maximum allocation granularity. */ + if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) order = HUGETLB_PAGE_ORDER; - else - order = MAX_ORDER - 1; /* * Assume the largest contiguous order of interest is a huge page. @@ -7438,12 +7465,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, * NOTE: this function is only called during memory hotplug */ #ifdef CONFIG_MEMORY_HOTPLUG -void __ref free_area_init_core_hotplug(int nid) +void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) { + int nid = pgdat->node_id; enum zone_type z; - pg_data_t *pgdat = NODE_DATA(nid); + int cpu; pgdat_init_internals(pgdat); + + if (pgdat->per_cpu_nodestats == &boot_nodestats) + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); + + /* + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly + * when it starts in the near future. + */ + pgdat->nr_zones = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_highest_zoneidx = 0; + pgdat->node_start_pfn = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } + for (z = 0; z < MAX_NR_ZONES; z++) zone_init_internals(&pgdat->node_zones[z], z, nid, 0); } @@ -7593,9 +7641,14 @@ static void __init free_area_init_node(int nid) pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + if (start_pfn != end_pfn) { + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + pr_info("Initmem setup node %d as memoryless\n", nid); + } + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); @@ -7604,7 +7657,7 @@ static void __init free_area_init_node(int nid) free_area_init_core(pgdat); } -void __init free_area_init_memoryless_node(int nid) +static void __init free_area_init_memoryless_node(int nid) { free_area_init_node(nid); } @@ -7908,10 +7961,17 @@ restart: out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) + for (nid = 0; nid < MAX_NUMNODES; nid++) { + unsigned long start_pfn, end_pfn; + zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (zone_movable_pfn[nid] >= end_pfn) + zone_movable_pfn[nid] = 0; + } + out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; @@ -8032,8 +8092,36 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + arch_refresh_nodedata(nid, pgdat); + free_area_init_memoryless_node(nid); + + /* + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. + */ + continue; + } + + pgdat = NODE_DATA(nid); free_area_init_node(nid); /* Any memory on that node */ @@ -8170,7 +8258,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) @@ -8410,7 +8498,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; + zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; spin_unlock_irqrestore(&zone->lock, flags); } @@ -8922,14 +9011,12 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, #ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { - return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) - 1); + return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); } static unsigned long pfn_max_align_up(unsigned long pfn) { - return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages)); + return ALIGN(pfn, MAX_ORDER_NR_PAGES); } #if defined(CONFIG_DYNAMIC_DEBUG) || \ @@ -9214,8 +9301,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. 
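The __setup_per_zone_wmarks() change above leaves WMARK_HIGH numerically where it was (low + tmp equals the old min + 2*tmp) and adds a new WMARK_PROMO level one step higher, used for NUMA tiering promotion. A worked example with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical zone: min watermark 1024 pages, per-zone tmp 256 pages */
	unsigned long min = 1024, tmp = 256;
	unsigned long low   = min + tmp;	/* 1280 */
	unsigned long high  = low + tmp;	/* 1536, same as the old min + 2*tmp */
	unsigned long promo = high + tmp;	/* 1792, new promotion watermark */

	printf("min=%lu low=%lu high=%lu promo=%lu\n", min, low, high, promo);
	return 0;
}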
If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. @@ -9388,6 +9475,7 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } +EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE /* @@ -9448,6 +9536,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9459,4 +9548,44 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ diff --git a/mm/page_counter.c b/mm/page_counter.c index 7d83641eb86b..eb156ff5d603 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used diff --git a/mm/page_ext.c b/mm/page_ext.c index 6242afb24d84..2e66d934d63f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -8,6 +8,7 @@ #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> +#include <linux/page_table_check.h> /* * struct page extension @@ -63,18 +64,21 @@ static bool need_page_idle(void) { return true; } -struct page_ext_operations page_idle_ops = { +static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, }; #endif -static struct page_ext_operations *page_ext_ops[] = { +static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_idle.c b/mm/page_idle.c index edead6a8a5f9..fc0435abf909 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -13,6 +13,8 @@ #include <linux/page_ext.h> #include <linux/page_idle.h> +#include "internal.h" + #define BITMAP_CHUNK_SIZE sizeof(u64) #define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * 
BITS_PER_BYTE) @@ -44,15 +46,11 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static bool page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); bool referenced = false; while (page_vma_mapped_walk(&pvmw)) { @@ -74,41 +72,41 @@ static bool page_idle_clear_pte_refs_one(struct page *page, } if (referenced) { - clear_page_idle(page); + folio_clear_idle(folio); /* * We cleared the referenced bit in a mapping to this page. To * avoid interference with page reclaim, mark it young so that - * page_referenced() will return > 0. + * folio_referenced() will return > 0. */ - set_page_young(page); + folio_set_young(folio); } return true; } static void page_idle_clear_pte_refs(struct page *page) { + struct folio *folio = page_folio(page); /* * Since rwc.arg is unused, rwc is effectively immutable, so we * can make it static const to save some cycles and stack. */ static const struct rmap_walk_control rwc = { .rmap_one = page_idle_clear_pte_refs_one, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!page_mapped(page) || - !page_rmapping(page)) + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) return; - need_lock = !PageAnon(page) || PageKsm(page); - if (need_lock && !trylock_page(page)) + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) return; - rmap_walk(page, (struct rmap_walk_control *)&rwc); + rmap_walk(folio, &rwc); if (need_lock) - unlock_page(page); + folio_unlock(folio); } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, diff --git a/mm/page_io.c b/mm/page_io.c index 9725c7e1eeea..b417f000b49e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,6 +25,7 @@ #include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> +#include <linux/delayacct.h> void end_swap_bio_write(struct bio *bio) { @@ -337,10 +338,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = bio_alloc(GFP_NOIO, 1); - bio_set_dev(bio, sis->bdev); + bio = bio_alloc(sis->bdev, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), + GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); bio->bi_end_io = end_write_func; bio_add_page(bio, page, thp_size(page), 0); @@ -358,6 +359,7 @@ int swap_readpage(struct page *page, bool synchronous) struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + bool workingset = PageWorkingset(page); unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); @@ -369,7 +371,9 @@ int swap_readpage(struct page *page, bool synchronous) * or the submitting cgroup IO-throttled, submission can be a * significant part of overall IO time. 
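The swap_readpage() change below this hunk enters psi_memstall accounting only when the page belonged to the workingset, and pairs it with per-task swapin delay accounting. Userspace observes the memstall side through the PSI interface; a small reader, assuming a kernel with CONFIG_PSI=y:

#include <stdio.h>

int main(void)
{
	/*
	 * /proc/pressure/memory reports the "some" and "full" stall time
	 * accumulated by psi_memstall_enter()/leave() sections such as the
	 * workingset swapin path.
	 */
	FILE *f = fopen("/proc/pressure/memory", "r");
	char line[256];

	if (!f) {
		perror("fopen /proc/pressure/memory");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}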
*/ - psi_memstall_enter(&pflags); + if (workingset) + psi_memstall_enter(&pflags); + delayacct_swapin_start(); if (frontswap_load(page) == 0) { SetPageUptodate(page); @@ -401,9 +405,7 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, sis->bdev); - bio->bi_opf = REQ_OP_READ; + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; bio_add_page(bio, page, thp_size(page), 0); @@ -431,20 +433,25 @@ int swap_readpage(struct page *page, bool synchronous) bio_put(bio); out: - psi_memstall_leave(&pflags); + if (workingset) + psi_memstall_leave(&pflags); + delayacct_swapin_end(); return ret; } -int swap_set_page_dirty(struct page *page) +bool swap_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio_swap_entry(folio)); if (data_race(sis->flags & SWP_FS_OPS)) { - struct address_space *mapping = sis->swap_file->f_mapping; + const struct address_space_operations *aops; + + mapping = sis->swap_file->f_mapping; + aops = mapping->a_ops; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - return mapping->a_ops->set_page_dirty(page); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + return aops->dirty_folio(mapping, folio); } else { - return __set_page_dirty_no_writeback(page); + return noop_dirty_folio(mapping, folio); } } diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f924957ce7a..99e360df9465 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf) } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -75,11 +75,13 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; + stack_depot_init(); + register_dummy_stack(); register_failure_stack(); register_early_stack(); diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000000..2458281bff89 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. 
+ * Pasha Tatashin <[email protected]> + */ +#include <linux/mm.h> +#include <linux/page_table_check.h> + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + return strtobool(buf, &__page_table_check_enabled); +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && + (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && + (pud_val(pud) & _PAGE_USER); +} + +/* + * An enty is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. + */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + unsigned long i; + bool anon; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * A new enty is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. 
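The counters above encode one rule per physical page: a page may be mapped into userspace either as anonymous memory or as file-backed memory, never both, and a writable anonymous mapping must be unique. A stand-alone restatement of the invariant that page_table_check_clear()/page_table_check_set() enforce (illustrative names, not the kernel's own code):

	struct map_counts {
		int anon_map_count;
		int file_map_count;
	};

	/* true if a mapping of the given kind keeps the page consistent */
	static bool counts_are_valid(const struct map_counts *c, bool anon, bool rw)
	{
		if (anon)	/* anon page: no file mappings; writable implies a single mapping */
			return c->file_map_count == 0 && (!rw || c->anon_map_count <= 1);
		return c->anon_map_count == 0;	/* file page: no anon mappings */
	}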
+ */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + unsigned long i; + bool anon; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. + */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + unsigned long i; + + BUG_ON(!page_ext); + for (i = 0; i < (1ul << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (&init_mm == mm) + return; + + __page_table_check_pte_clear(mm, addr, *ptep); + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + if (&init_mm == mm) + return; + + __page_table_check_pmd_clear(mm, addr, *pmdp); + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (&init_mm == mm) + return; + + __page_table_check_pud_clear(mm, addr, *pudp); + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT, + pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + pte_unmap(ptep); + for (i = 
0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + } +} diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index f7b331081791..1187f9c1ec5b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -53,18 +53,6 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } -static inline bool pfn_is_match(struct page *page, unsigned long pfn) -{ - unsigned long page_pfn = page_to_pfn(page); - - /* normal page and hugetlbfs page */ - if (!PageTransCompound(page) || PageHuge(page)) - return page_pfn == pfn; - - /* THP can be referenced by any subpage */ - return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page); -} - /** * check_pte - check if @pvmw->page is mapped at the @pvmw->pte * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking @@ -116,7 +104,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) pfn = pte_pfn(*pvmw->pte); } - return pfn_is_match(pvmw->page, pfn); + return (pfn - pvmw->pfn) < pvmw->nr_pages; +} + +/* Returns true if the two ranges overlap. Careful to not overflow. */ +static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) +{ + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) + return false; + if (pfn > pvmw->pfn + pvmw->nr_pages - 1) + return false; + return true; } static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) @@ -127,7 +125,7 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) } /** - * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at * @pvmw->address * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags * must be set. pmd, pte and ptl must be NULL. @@ -152,8 +150,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) */ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { - struct mm_struct *mm = pvmw->vma->vm_mm; - struct page *page = pvmw->page; + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; unsigned long end; pgd_t *pgd; p4d_t *p4d; @@ -164,32 +162,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (unlikely(PageHuge(page))) { + if (unlikely(is_vm_hugetlb_page(vma))) { + unsigned long size = pvmw->nr_pages * PAGE_SIZE; /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, size); if (!pvmw->pte) return false; - pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm, + pvmw->pte); spin_lock(pvmw->ptl); if (!check_pte(pvmw)) return not_found(pvmw); return true; } - /* - * Seek to next pte only makes sense for THP. - * But more important than that optimization, is to filter out - * any PageKsm page: whose page->index misleads vma_address() - * and vma_address_end() to disaster. - */ - end = PageTransCompound(page) ? 
- vma_address_end(page, pvmw->vma) : - pvmw->address + PAGE_SIZE; + end = vma_address_end(pvmw); if (pvmw->pte) goto next_pte; restart: @@ -224,7 +216,7 @@ restart: if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (pmd_page(pmde) != page) + if (!check_pmd(pmd_pfn(pmde), pvmw)) return not_found(pvmw); return true; } @@ -236,7 +228,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - pfn_swap_entry_to_page(entry) != page) + !check_pmd(swp_offset(entry), pvmw)) return not_found(pvmw); return true; } @@ -250,7 +242,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - PageTransCompound(page)) { + transparent_hugepage_active(vma) && + (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); spin_unlock(ptl); @@ -307,7 +300,8 @@ next_pte: int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { - .page = page, + .pfn = page_to_pfn(page), + .nr_pages = 1, .vma = vma, .flags = PVMW_SYNC, }; diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 639662c20c82..411d1593ef23 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. + */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size; + + extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + + return size * num_possible_cpus() + extra_size; +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..ea28db283044 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - unsigned int rs, re, start; /* region start, region end */ + unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - bitmap_for_each_clear_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) - pcpu_block_update(block, rs, re); + for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) + pcpu_block_update(block, start, end); } /** @@ -1070,17 +1069,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - unsigned int page_start, page_end, rs, re; + unsigned int start, end; - page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); - page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - rs = page_start; - bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); - if (rs >= page_end) + start = 
find_next_zero_bit(chunk->populated, end, start); + if (start >= end) return true; - *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + end = find_next_bit(chunk->populated, end, start + 1); + + *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } @@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { obj_cgroup_put(objcg); return false; } @@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - size * num_possible_cpus()); + pcpu_obj_full_size(size)); rcu_read_unlock(); } else { - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); obj_cgroup_put(objcg); } } @@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - -(size * num_possible_cpus())); + -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); @@ -1851,13 +1851,12 @@ area_found: /* populate if not all pages are already there */ if (!is_atomic) { - unsigned int page_start, page_end, rs, re; + unsigned int page_end, rs, re; - page_start = PFN_DOWN(off); + rs = PFN_DOWN(off); page_end = PFN_UP(off + size); - bitmap_for_each_clear_region(chunk->populated, rs, re, - page_start, page_end) { + for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -2013,8 +2012,7 @@ static void pcpu_balance_free(bool empty_only) list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; - bitmap_for_each_set_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -2084,8 +2082,7 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - bitmap_for_each_clear_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); @@ -2992,6 +2989,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( return ai; } + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NUMA + int node = NUMA_NO_NODE; + void *ptr; + + if (cpu_to_nd_fn) + node = cpu_to_nd_fn(cpu); + + if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", + cpu, size, (u64)__pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, + node); + + pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", + cpu, size, node, (u64)__pa(ptr)); + } + return ptr; +#else + return 
memblock_alloc_from(size, align, goal); +#endif +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + memblock_free(ptr, size); +} #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) @@ -3001,14 +3034,13 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional - * @alloc_fn: function to allocate percpu page - * @free_fn: function to free percpu page + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * by calling @alloc_fn and used as-is without being mapped into + * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * @@ -3022,7 +3054,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned using @free_fn. + * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. @@ -3030,8 +3062,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn) + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; @@ -3066,7 +3097,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ - ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; @@ -3105,12 +3136,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ - free_fn(ptr, ai->unit_size); + pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); - free_fn(ptr + size_sum, ai->unit_size - size_sum); + pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3129,7 +3160,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) - free_fn(areas[group], + pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); @@ -3140,12 +3171,79 @@ out_free: #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK +#include <asm/pgalloc.h> + +#ifndef P4D_TABLE_SIZE +#define P4D_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PUD_TABLE_SIZE +#define PUD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PMD_TABLE_SIZE +#define PMD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PTE_TABLE_SIZE +#define PTE_TABLE_SIZE PAGE_SIZE +#endif +void __init __weak pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t 
*pmd; + + if (pgd_none(*pgd)) { + p4d_t *new; + + new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + if (!new) + goto err_alloc; + pgd_populate(&init_mm, pgd, new); + } + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate memory\n", __func__); +} + /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes - * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: function to free percpu page, always called with PAGE_SIZE - * @populate_pte_fn: function to populate pte + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. @@ -3156,10 +3254,7 @@ out_free: * RETURNS: * 0 on success, -errno on failure. */ -int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -3201,7 +3296,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); @@ -3223,7 +3318,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) - populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], @@ -3253,7 +3348,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, enomem: while (--j >= 0) - free_fn(page_address(pages[j]), PAGE_SIZE); + pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); @@ -3278,17 +3373,6 @@ out_free_ar: unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, - size_t align) -{ - return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); -} - -static void __init pcpu_dfl_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -3299,9 +3383,8 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. 
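With pcpu_fc_alloc()/pcpu_fc_free() moved into the core, an architecture no longer passes allocator callbacks to the first-chunk helpers; it only supplies an optional cpu-to-node hook. A hypothetical arch-side caller under the new signatures (early_cpu_to_node() stands in for whatever node lookup the architecture provides; the generic setup_per_cpu_areas() below simply passes NULL):

	static int __init my_cpu_to_node(int cpu)
	{
		return early_cpu_to_node(cpu);		/* arch-specific lookup, assumed */
	}

	void __init setup_per_cpu_areas(void)
	{
		int rc;

		rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
					    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
					    NULL, my_cpu_to_node);
		if (rc < 0)
			panic("Failed to initialize percpu areas.");
	}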
*/ - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, - pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, + PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf9794..6523fda274e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> +#include <linux/mm_inline.h> #include <asm/tlb.h> /* diff --git a/mm/ptdump.c b/mm/ptdump.c index da751448d0e4..eea3d28d173c 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 6ae5693de28c..d3a47546d17d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,111 @@ * Initial version. */ +/** + * DOC: Readahead Overview + * + * Readahead is used to read content into the page cache before it is + * explicitly requested by the application. Readahead only ever + * attempts to read pages that are not yet in the page cache. If a + * page is present but not up-to-date, readahead will not try to read + * it. In that case a simple ->readpage() will be requested. + * + * Readahead is triggered when an application read request (whether a + * systemcall or a page fault) finds that the requested page is not in + * the page cache, or that it is in the page cache and has the + * %PG_readahead flag set. This flag indicates that the page was loaded + * as part of a previous read-ahead request and now that it has been + * accessed, it is time for the next read-ahead. + * + * Each readahead request is partly synchronous read, and partly async + * read-ahead. This is reflected in the struct file_ra_state which + * contains ->size being to total number of pages, and ->async_size + * which is the number of pages in the async section. The first page in + * this async section will have %PG_readahead set as a trigger for a + * subsequent read ahead. Once a series of sequential reads has been + * established, there should be no need for a synchronous component and + * all read ahead request will be fully asynchronous. + * + * When either of the triggers causes a readahead, three numbers need to + * be determined: the start of the region, the size of the region, and + * the size of the async tail. 
+ * + * The start of the region is simply the first page address at or after + * the accessed address, which is not currently populated in the page + * cache. This is found with a simple search in the page cache. + * + * The size of the async tail is determined by subtracting the size that + * was explicitly requested from the determined request size, unless + * this would be less than zero - then zero is used. NOTE THIS + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED + * PAGE. + * + * The size of the region is normally determined from the size of the + * previous readahead which loaded the preceding pages. This may be + * discovered from the struct file_ra_state for simple sequential reads, + * or from examining the state of the page cache when multiple + * sequential reads are interleaved. Specifically: where the readahead + * was triggered by the %PG_readahead flag, the size of the previous + * readahead is assumed to be the number of pages from the triggering + * page to the start of the new readahead. In these cases, the size of + * the previous readahead is scaled, often doubled, for the new + * readahead, though see get_next_ra_size() for details. + * + * If the size of the previous read cannot be determined, the number of + * preceding pages in the page cache is used to estimate the size of + * a previous read. This estimate could easily be misled by random + * reads being coincidentally adjacent, so it is ignored unless it is + * larger than the current request, and it is not scaled up, unless it + * is at the start of file. + * + * In general read ahead is accelerated at the start of the file, as + * reads from there are often sequential. There are other minor + * adjustments to the read ahead size in various special cases and these + * are best discovered by reading the code. + * + * The above calculation determines the readahead, to which any requested + * read size may be added. + * + * Readahead requests are sent to the filesystem using the ->readahead() + * address space operation, for which mpage_readahead() is a canonical + * implementation. ->readahead() should normally initiate reads on all + * pages, but may fail to read any or all pages without causing an IO + * error. The page cache reading code will issue a ->readpage() request + * for any page which ->readahead() does not provided, and only an error + * from this will be final. + * + * ->readahead() will generally call readahead_page() repeatedly to get + * each page from those prepared for read ahead. It may fail to read a + * page by: + * + * * not calling readahead_page() sufficiently many times, effectively + * ignoring some pages, as might be appropriate if the path to + * storage is congested. + * + * * failing to actually submit a read request for a given page, + * possibly due to insufficient resources, or + * + * * getting an error during subsequent processing of a request. + * + * In the last two cases, the page should be unlocked to indicate that + * the read attempt has failed. In the first case the page will be + * unlocked by the caller. + * + * Those pages not in the final ``async_size`` of the request should be + * considered to be important and ->readahead() should not fail them due + * to congestion or temporary resource unavailability, but should wait + * for necessary resources (e.g. memory or indexing information) to + * become available. Pages in the final ``async_size`` may be + * considered less urgent and failure to read them is more acceptable. 
+ * In this case it is best to use delete_from_page_cache() to remove the + * pages from the page cache as is automatically done for pages that + * were not fetched with readahead_page(). This will allow a + * subsequent synchronous read ahead request to try them again. If they + * are left in the page cache, then they will be read individually using + * ->readpage(). + * + */ + #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> @@ -51,7 +156,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, if (!trylock_page(page)) BUG(); page->mapping = mapping; - do_invalidatepage(page, 0, PAGE_SIZE); + folio_invalidate(page_folio(page), 0, PAGE_SIZE); page->mapping = NULL; unlock_page(page); } @@ -127,8 +232,17 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, if (aops->readahead) { aops->readahead(rac); - /* Clean up the remaining pages */ + /* + * Clean up the remaining pages. The sizes in ->ra + * maybe be used to size next read-ahead, so make sure + * they accurately reflect what happened. + */ while ((page = readahead_page(rac))) { + rac->ra->size -= 1; + if (rac->ra->async_size > 0) { + rac->ra->async_size -= 1; + delete_from_page_cache(page); + } unlock_page(page); put_page(page); } @@ -148,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_finish_plug(&plug); - BUG_ON(!list_empty(pages)); + BUG_ON(pages && !list_empty(pages)); BUG_ON(readahead_count(rac)); out: @@ -196,9 +310,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { - struct page *page = xa_load(&mapping->i_pages, index + i); + struct folio *folio = xa_load(&mapping->i_pages, index + i); - if (page && !xa_is_value(page)) { + if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the @@ -212,21 +326,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) break; if (mapping->a_ops->readpages) { - page->index = index + i; - list_add(&page->lru, &page_pool); - } else if (add_to_page_cache_lru(page, mapping, index + i, + folio->index = index + i; + list_add(&folio->lru, &page_pool); + } else if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { - put_page(page); + folio_put(folio); read_pages(ractl, &page_pool, true); i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) - SetPageReadahead(page); + folio_set_readahead(folio); ractl->_nr_pages++; } @@ -247,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ -void do_page_cache_ra(struct readahead_control *ractl, +static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; @@ -432,10 +546,102 @@ static int try_context_readahead(struct address_space *mapping, } /* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. 
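For orientation, the window described in the DOC text above maps onto struct file_ra_state as a contiguous run of page indices with an asynchronous tail; the folio at start + size - async_size is the one that gets the readahead flag, so reaching it triggers the next readahead. A descriptive sketch (not code from this patch):

	pgoff_t first  = ra->start;					/* first index in the current window */
	pgoff_t last   = ra->start + ra->size - 1;			/* last index in the current window */
	pgoff_t marker = ra->start + ra->size - ra->async_size;	/* folio that gets the readahead flag set */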
I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + +static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, + pgoff_t mark, unsigned int order, gfp_t gfp) +{ + int err; + struct folio *folio = filemap_alloc_folio(gfp, order); + + if (!folio) + return -ENOMEM; + if (mark - index < (1UL << order)) + folio_set_readahead(folio); + err = filemap_add_folio(ractl->mapping, folio, index, gfp); + if (err) + folio_put(folio); + else + ractl->_nr_pages += 1UL << order; + return err; +} + +void page_cache_ra_order(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned int new_order) +{ + struct address_space *mapping = ractl->mapping; + pgoff_t index = readahead_index(ractl); + pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + pgoff_t mark = index + ra->size - ra->async_size; + int err = 0; + gfp_t gfp = readahead_gfp_mask(mapping); + + if (!mapping_large_folio_support(mapping) || ra->size < 4) + goto fallback; + + limit = min(limit, index + ra->size - 1); + + if (new_order < MAX_PAGECACHE_ORDER) { + new_order += 2; + if (new_order > MAX_PAGECACHE_ORDER) + new_order = MAX_PAGECACHE_ORDER; + while ((1 << new_order) > ra->size) + new_order--; + } + + while (index <= limit) { + unsigned int order = new_order; + + /* Align with smaller pages if needed */ + if (index & ((1UL << order) - 1)) { + order = __ffs(index); + if (order == 1) + order = 0; + } + /* Don't allocate pages past EOF */ + while (index + (1UL << order) - 1 > limit) { + if (--order == 1) + order = 0; + } + err = ra_alloc_folio(ractl, index, mark, order, gfp); + if (err) + break; + index += 1UL << order; + } + + if (index > limit) { + ra->size += index - limit - 1; + ra->async_size += index - limit - 1; + } + + read_pages(ractl, NULL, false); + + /* + * If there were already pages in the page cache, then we may have + * left some gaps. Let the regular readahead code take care of this + * situation. + */ + if (!err) + return; +fallback: + do_page_cache_ra(ractl, ra->size, ra->async_size); +} + +/* * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, - bool hit_readahead_marker, unsigned long req_size) + struct folio *folio, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra; @@ -470,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl, } /* - * Hit a marked page without valid readahead state. + * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ - if (hit_readahead_marker) { + if (folio) { pgoff_t start; rcu_read_lock(); @@ -548,7 +754,7 @@ readit: } ractl->_index = ra->start; - do_page_cache_ra(ractl, ra->size, ra->async_size); + page_cache_ra_order(ractl, ra, folio ? 
folio_order(folio) : 0); } void page_cache_sync_ra(struct readahead_control *ractl, @@ -576,12 +782,12 @@ void page_cache_sync_ra(struct readahead_control *ractl, } /* do read-ahead */ - ondemand_readahead(ractl, false, req_count); + ondemand_readahead(ractl, NULL, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct page *page, unsigned long req_count) + struct folio *folio, unsigned long req_count) { /* no read-ahead */ if (!ractl->ra->ra_pages) @@ -590,22 +796,16 @@ void page_cache_async_ra(struct readahead_control *ractl, /* * Same bit is used for PG_readahead and PG_reclaim. */ - if (PageWriteback(page)) + if (folio_test_writeback(folio)) return; - ClearPageReadahead(page); - - /* - * Defer asynchronous read-ahead on IO congestion. - */ - if (inode_read_congested(ractl->mapping->host)) - return; + folio_clear_readahead(folio); if (blk_cgroup_congested()) return; /* do read-ahead */ - ondemand_readahead(ractl, true, req_count); + ondemand_readahead(ractl, folio, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); diff --git a/mm/rmap.c b/mm/rmap.c index 163ac4e6bcee..615b5d323ee2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,8 +31,8 @@ * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * lock_page_memcg move_lock (in __set_page_dirty_buffers) + * mapping->private_lock (in block_dirty_folio) + * folio_lock_memcg move_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) @@ -107,15 +107,15 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma_read() such that + * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * down_read_trylock() from page_lock_anon_vma_read(). This orders: + * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma_read() VS put_anon_vma() + * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() @@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma_read() + * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). @@ -526,28 +526,28 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. 
*/ -struct anon_vma *page_lock_anon_vma_read(struct page *page) +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long)READ_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; - if (!page_mapped(page)) + if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* - * If the page is still mapped, then this anon_vma is still + * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } @@ -560,7 +560,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) goto out; } - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; @@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. + */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; @@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) * before the PTE is cleared. */ barrier(); - mm->tlb_flush_batched = true; + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } /* * If the PTE was dirty then it's best to assume it's writable. The @@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (data_race(mm->tlb_flush_batched)) { - flush_tlb_mm(mm); + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + if (pending != flushed) { + flush_tlb_mm(mm); /* - * Do not allow the compiler to re-order the clearing of - * tlb_flush_batched before the tlb is flushed. + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing. 
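The generation counters above replace the old single tlb_flush_batched flag; a condensed restatement of how flush_tlb_batched_pending() reads them (mirroring the code in this hunk, for orientation only):

	int batch   = atomic_read(&mm->tlb_flush_batched);
	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;	/* bits 0-14: PTE clears batched so far */
	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;	/* bits 16-30: generations already flushed */

	if (pending != flushed)		/* some batched PTE clears have not hit the TLB yet */
		flush_tlb_mm(mm);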
*/ - barrier(); - mm->tlb_flush_batched = false; + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else @@ -708,8 +737,9 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - if (PageAnon(page)) { - struct anon_vma *page__anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + if (folio_test_anon(folio)) { + struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. @@ -719,7 +749,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; - } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { + } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } @@ -760,30 +790,29 @@ out: return pmd; } -struct page_referenced_arg { +struct folio_referenced_arg { int mapcount; int referenced; unsigned long vm_flags; struct mem_cgroup *memcg; }; /* - * arg: page_referenced_arg will be passed + * arg: folio_referenced_arg will be passed */ -static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *arg) +static bool folio_referenced_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long address, void *arg) { - struct page_referenced_arg *pra = arg; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + struct folio_referenced_arg *pra = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && + (!folio_test_large(folio) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma, !pvmw.pte); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ @@ -795,10 +824,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, /* * Don't treat a reference through * a sequentially read mapping as such. - * If the page has been used in another mapping, + * If the folio has been used in another mapping, * we will catch it; if this other mapping is * already gone, the unmap path will have set - * PG_referenced or activated the page. + * the referenced flag or activated the folio. */ if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; @@ -808,7 +837,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, pvmw.pmd)) referenced++; } else { - /* unexpected pmd-mapped page? */ + /* unexpected pmd-mapped folio? 
*/ WARN_ON_ONCE(1); } @@ -816,13 +845,13 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (referenced) - clear_page_idle(page); - if (test_and_clear_page_young(page)) + folio_clear_idle(folio); + if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; - pra->vm_flags |= vma->vm_flags; + pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) @@ -831,9 +860,9 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, return true; } -static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) +static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { - struct page_referenced_arg *pra = arg; + struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; if (!mm_match_cgroup(vma->vm_mm, memcg)) @@ -843,40 +872,39 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) } /** - * page_referenced - test if the page was referenced - * @page: the page to test - * @is_locked: caller holds lock on the page + * folio_referenced() - Test if the folio was referenced. + * @folio: The folio to test. + * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page + * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. + * + * Quick test_and_clear_referenced for all mappings of a folio, * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of ptes which referenced the page. + * Return: The number of mappings which referenced the folio. */ -int page_referenced(struct page *page, - int is_locked, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +int folio_referenced(struct folio *folio, int is_locked, + struct mem_cgroup *memcg, unsigned long *vm_flags) { int we_locked = 0; - struct page_referenced_arg pra = { - .mapcount = total_mapcount(page), + struct folio_referenced_arg pra = { + .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { - .rmap_one = page_referenced_one, + .rmap_one = folio_referenced_one, .arg = (void *)&pra, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; *vm_flags = 0; if (!pra.mapcount) return 0; - if (!page_rmapping(page)) + if (!folio_raw_mapping(folio)) return 0; - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); + if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { + we_locked = folio_trylock(folio); if (!we_locked) return 1; } @@ -887,37 +915,32 @@ int page_referenced(struct page *page, * cgroups */ if (memcg) { - rwc.invalid_vma = invalid_page_referenced_vma; + rwc.invalid_vma = invalid_folio_referenced_vma; } - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) - unlock_page(page); + folio_unlock(folio); return pra.referenced; } -static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - .flags = PVMW_SYNC, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); struct mmu_notifier_range range; int *cleaned = arg; /* * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free from this function. 
+ * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - vma_address_end(page, vma)); + vma_address_end(&pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -945,14 +968,14 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; - flush_cache_page(vma, address, page_to_pfn(page)); + flush_cache_page(vma, address, folio_pfn(folio)); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else - /* unexpected pmd-mapped page? */ + /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); #endif } @@ -1000,7 +1023,7 @@ int folio_mkclean(struct folio *folio) if (!mapping) return 0; - rmap_walk(&folio->page, &rwc); + rmap_walk(folio, &rwc); return cleaned; } @@ -1028,8 +1051,8 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written - * simultaneously, so a concurrent reader (eg page_referenced()'s - * PageAnon()) will not see one without the other. + * simultaneously, so a concurrent reader (eg folio_referenced()'s + * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); } @@ -1079,6 +1102,7 @@ static void __page_set_anon_rmap(struct page *page, static void __page_check_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct folio *folio = page_folio(page); /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1090,7 +1114,8 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); + VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, + folio); VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), page); } @@ -1152,17 +1177,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1187,8 +1212,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { @@ -1203,12 +1227,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. 
*/ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1223,6 +1249,17 @@ void page_add_file_rmap(struct page *page, bool compound) } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; + + /* + * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); + * but page lock is held by all page_add_file_rmap() compound + * callers, and SetPageDoubleMap below warns if !PageLocked: + * so here is a place that DoubleMap can be safely cleared. + */ + VM_WARN_ON_ONCE(!PageLocked(page)); + if (nr == nr_pages && PageDoubleMap(page)) + ClearPageDoubleMap(page); + if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, nr_pages); @@ -1231,13 +1268,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1245,6 +1277,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1287,9 +1321,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); - - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1329,9 +1360,6 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = thp_nr_pages(page); } - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } @@ -1339,11 +1367,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. 
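Threading the vma through the rmap helpers lets mlock accounting (mlock_vma_page()/munlock_vma_page()) live inside rmap rather than in every caller. Under the new signatures a pte-mapped file page is tracked roughly as below (a sketch only; pte locking and error paths omitted):

	/* pte lock held; vma is the mapping being modified */
	page_add_file_rmap(page, vma, false);	/* may mlock the page for a VM_LOCKED vma */

	/* later, when the pte is torn down */
	page_remove_rmap(page, vma, false);	/* may munlock the page again */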
*/ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1368,9 +1398,6 @@ void page_remove_rmap(struct page *page, bool compound) */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); @@ -1385,20 +1412,18 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* * @arg: enum ttu_flags will be passed to this argument */ -static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool ret = true; @@ -1415,21 +1440,20 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pvmw.flags = PVMW_SYNC; if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address, false, page); + split_huge_pmd_address(vma, address, false, folio); /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * - * Note that the page can not be free in this function as call of - * try_to_unmap() must hold a reference on the page. + * Note that the folio can not be freed in this function as call of + * try_to_unmap() must hold a reference on the folio. */ - range.end = PageKsm(page) ? - address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. @@ -1440,32 +1464,26 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_FOLIO(!pvmw.pte, folio); + /* - * If the page is mlock()d, we cannot swap it out. + * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; - if (PageHuge(page) && !PageAnon(page)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. 
Caller needs to explicitly @@ -1504,7 +1522,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially - * a remote CPU could still be writing to the page. + * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through @@ -1517,22 +1535,22 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pteval = ptep_clear_flush(vma, address, pvmw.pte); } - /* Move the dirty bit to the page. Now the pte is gone. */ + /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - if (PageHuge(page)) { - hugetlb_count_sub(compound_nr(page), mm); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); } else { - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1547,18 +1565,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); - } else if (PageAnon(page)) { + } else if (folio_test_anon(folio)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ - if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + if (unlikely(folio_test_swapbacked(folio) != + folio_test_swapcache(folio))) { WARN_ON_ONCE(1); ret = false; /* We have to invalidate as we cleared the pte */ @@ -1569,8 +1588,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* MADV_FREE page check */ - if (!PageSwapBacked(page)) { - if (!PageDirty(page)) { + if (!folio_test_swapbacked(folio)) { + if (!folio_test_dirty(folio)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1579,11 +1598,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * If the page was redirtied, it cannot be + * If the folio was redirtied, it cannot be * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - SetPageSwapBacked(page); + folio_set_swapbacked(folio); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -1620,16 +1639,17 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address + PAGE_SIZE); } else { /* - * This is a locked file-backed page, thus it cannot - * be removed from the page cache and replaced by a new - * page before mmu_notifier_invalidate_range_end, so no - * concurrent thread might update its page table to - * point at new page while a device still is using this - * page. 
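[Editor's note] The MADV_FREE branch above amounts to a three-way decision for anonymous folios: an ordinary swap-backed folio gets a swap entry, a clean lazily-freeable folio is discarded outright, and a redirtied one has its PTE restored and becomes swap-backed again. A small model of that decision (names invented; not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum ttu_action { TTU_DISCARD, TTU_SWAP_OUT, TTU_RESTORE_PTE };

static enum ttu_action madv_free_decision(bool swapbacked, bool dirty)
{
	if (swapbacked)
		return TTU_SWAP_OUT;	/* normal anon path: install a swap entry */
	if (!dirty)
		return TTU_DISCARD;	/* clean MADV_FREE folio: just drop it */
	return TTU_RESTORE_PTE;		/* redirtied: keep it, mark it swapbacked */
}

int main(void)
{
	printf("clean MADV_FREE : %d\n", madv_free_decision(false, false));
	printf("redirtied       : %d\n", madv_free_decision(false, true));
	printf("ordinary anon   : %d\n", madv_free_decision(true,  true));
	return 0;
}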
+ * This is a locked file-backed folio, + * so it cannot be removed from the page + * cache and replaced by a new folio before + * mmu_notifier_invalidate_range_end, so no + * concurrent thread might update its page table + * to point at a new folio while a device is + * still using this folio. * * See Documentation/vm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(page)); + dec_mm_counter(mm, mm_counter_file(&folio->page)); } discard: /* @@ -1639,8 +1659,10 @@ discard: * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); - put_page(page); + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + folio_put(folio); } mmu_notifier_invalidate_range_end(&range); @@ -1653,35 +1675,35 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_not_mapped(struct folio *folio) { - return !page_mapped(page); + return !folio_mapped(folio); } /** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped + * try_to_unmap - Try to remove all page table mappings to a folio. + * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock. + * folio. It is the caller's responsibility to check if the folio is + * still mapped if needed (use TTU_SYNC to prevent accounting races). * - * It is the caller's responsibility to check if the page is still - * mapped when needed (use TTU_SYNC to prevent accounting races). + * Context: Caller must hold the folio lock. */ -void try_to_unmap(struct page *page, enum ttu_flags flags) +void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(page, &rwc); + rmap_walk_locked(folio, &rwc); else - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); } /* @@ -1690,15 +1712,11 @@ void try_to_unmap(struct page *page, enum ttu_flags flags) * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ -static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool ret = true; @@ -1719,7 +1737,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * TTU_SPLIT_HUGE_PMD and it wants to freeze. */ if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address, true, page); + split_huge_pmd_address(vma, address, true, folio); /* * For THP, we have to assume the worse case ie pmd for invalidation. @@ -1729,11 +1747,10 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - range.end = PageKsm(page) ? 
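[Editor's note] try_to_unmap() above is a thin wrapper that feeds try_to_unmap_one() into the generic rmap walker: the walker calls ->rmap_one() for every VMA that may map the folio, stops early if it returns false, and may also stop once ->done() reports the folio unmapped. A self-contained model of that callback shape (all toy_* names are invented):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_folio { int mapcount; };
struct toy_vma { int id; };

struct toy_walk_control {
	bool (*rmap_one)(struct toy_folio *, struct toy_vma *, void *arg);
	bool (*done)(struct toy_folio *);
	void *arg;
};

static void toy_rmap_walk(struct toy_folio *folio, struct toy_vma *vmas,
			  size_t nr, const struct toy_walk_control *rwc)
{
	for (size_t i = 0; i < nr; i++) {
		if (!rwc->rmap_one(folio, &vmas[i], rwc->arg))
			break;			/* e.g. hit a VM_LOCKED vma */
		if (rwc->done && rwc->done(folio))
			break;			/* fully unmapped, stop early */
	}
}

static bool unmap_one(struct toy_folio *f, struct toy_vma *v, void *arg)
{
	(void)arg;
	printf("unmapping from vma %d\n", v->id);
	f->mapcount--;
	return true;
}

static bool not_mapped(struct toy_folio *f) { return f->mapcount == 0; }

int main(void)
{
	struct toy_vma vmas[] = { {1}, {2}, {3} };
	struct toy_folio folio = { .mapcount = 2 };
	struct toy_walk_control rwc = { .rmap_one = unmap_one, .done = not_mapped };

	toy_rmap_walk(&folio, vmas, 3, &rwc);	/* stops after vma 2 */
	return 0;
}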
- address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. @@ -1747,21 +1764,24 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { - VM_BUG_ON_PAGE(PageHuge(page) || - !PageTransCompound(page), page); + subpage = folio_page(folio, + pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); - set_pmd_migration_entry(&pvmw, page); + set_pmd_migration_entry(&pvmw, subpage); continue; } #endif /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); + VM_BUG_ON_FOLIO(!pvmw.pte, folio); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; - if (PageHuge(page) && !PageAnon(page)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly @@ -1799,15 +1819,15 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); pteval = ptep_clear_flush(vma, address, pvmw.pte); - /* Move the dirty bit to the page. Now the pte is gone. */ + /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (is_zone_device_page(page)) { - unsigned long pfn = page_to_pfn(page); + if (folio_is_zone_device(folio)) { + unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; @@ -1843,16 +1863,16 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * changed when hugepage migrations to device private * memory are supported. */ - subpage = page; - } else if (PageHWPoison(page)) { + subpage = &folio->page; + } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - if (PageHuge(page)) { - hugetlb_count_sub(compound_nr(page), mm); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); } else { - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1867,7 +1887,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. 
*/ - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1913,8 +1933,10 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); - put_page(page); + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); + folio_put(folio); } mmu_notifier_invalidate_range_end(&range); @@ -1924,19 +1946,19 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, /** * try_to_migrate - try to replace all page table mappings with swap entries - * @page: the page to replace page table entries for + * @folio: the folio to replace page table entries for * @flags: action and flags * - * Tries to remove all the page table entries which are mapping this page and - * replace them with special swap entries. Caller must hold the page lock. + * Tries to remove all the page table entries which are mapping this folio and + * replace them with special swap entries. Caller must hold the folio lock. */ -void try_to_migrate(struct page *page, enum ttu_flags flags) +void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, }; /* @@ -1947,7 +1969,7 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) TTU_SYNC))) return; - if (is_zone_device_page(page) && !is_device_private_page(page)) + if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) return; /* @@ -1958,83 +1980,13 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (!PageKsm(page) && PageAnon(page)) + if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - rmap_walk_locked(page, &rwc); + rmap_walk_locked(folio, &rwc); else - rmap_walk(page, &rwc); -} - -/* - * Walks the vma's mapping a page and mlocks the page if any locked vma's are - * found. Once one is found the page is locked and the scan can be terminated. - */ -static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *unused) -{ - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; - - /* An un-locked vma doesn't have any pages to lock, continue the scan */ - if (!(vma->vm_flags & VM_LOCKED)) - return true; - - while (page_vma_mapped_walk(&pvmw)) { - /* - * Need to recheck under the ptl to serialise with - * __munlock_pagevec_fill() after VM_LOCKED is cleared in - * munlock_vma_pages_range(). - */ - if (vma->vm_flags & VM_LOCKED) { - /* - * PTE-mapped THP are never marked as mlocked; but - * this function is never called on a DoubleMap THP, - * nor on an Anon THP (which may still be PTE-mapped - * after DoubleMap was cleared). - */ - mlock_vma_page(page); - /* - * No need to scan further once the page is marked - * as mlocked. - */ - page_vma_mapped_walk_done(&pvmw); - return false; - } - } - - return true; -} - -/** - * page_mlock - try to mlock a page - * @page: the page to be mlocked - * - * Called from munlock code. 
Checks all of the VMAs mapping the page and mlocks - * the page if any are found. The page will be returned with PG_mlocked cleared - * if it is not mapped by any locked vmas. - */ -void page_mlock(struct page *page) -{ - struct rmap_walk_control rwc = { - .rmap_one = page_mlock_one, - .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, - - }; - - VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - - /* Anon THP are only marked as mlocked when singly mapped */ - if (PageTransCompound(page) && PageAnon(page)) - return; - - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE @@ -2045,15 +1997,11 @@ struct make_exclusive_args { bool valid; }; -static bool page_make_device_exclusive_one(struct page *page, +static bool page_make_device_exclusive_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *priv) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); struct make_exclusive_args *args = priv; pte_t pteval; struct page *subpage; @@ -2064,12 +2012,13 @@ static bool page_make_device_exclusive_one(struct page *page, mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, vma->vm_mm, address, min(vma->vm_end, - address + page_size(page)), args->owner); + address + folio_size(folio)), + args->owner); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); + VM_BUG_ON_FOLIO(!pvmw.pte, folio); if (!pte_present(*pvmw.pte)) { ret = false; @@ -2077,16 +2026,17 @@ static bool page_make_device_exclusive_one(struct page *page, break; } - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); pteval = ptep_clear_flush(vma, address, pvmw.pte); - /* Move the dirty bit to the page. Now the pte is gone. */ + /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) - set_page_dirty(page); + folio_mark_dirty(folio); /* * Check that our target page is still mapped at the expected @@ -2119,7 +2069,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); @@ -2128,21 +2078,22 @@ static bool page_make_device_exclusive_one(struct page *page, } /** - * page_make_device_exclusive - mark the page exclusively owned by a device - * @page: the page to replace page table entries for - * @mm: the mm_struct where the page is expected to be mapped - * @address: address where the page is expected to be mapped + * folio_make_device_exclusive - Mark the folio exclusively owned by a device. + * @folio: The folio to replace page table entries for. + * @mm: The mm_struct where the folio is expected to be mapped. + * @address: Address where the folio is expected to be mapped. 
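[Editor's note] In page_make_device_exclusive_one() above, the MMU-notifier range is bounded by min(vma->vm_end, address + folio_size(folio)), so a large folio mapped near the end of a VMA never produces an invalidation past the VMA. Plain-arithmetic sketch of that clamp (not kernel code):

#include <stdio.h>

static unsigned long range_end(unsigned long address, unsigned long folio_size,
			       unsigned long vm_end)
{
	unsigned long end = address + folio_size;

	return end < vm_end ? end : vm_end;	/* min(vm_end, address + folio_size) */
}

int main(void)
{
	/* 2MB folio mapped near the end of a VMA: the range is clamped to vm_end */
	printf("end = %#lx\n", range_end(0x7f0000100000UL, 0x200000UL,
					0x7f0000180000UL));
	return 0;
}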
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks * - * Tries to remove all the page table entries which are mapping this page and - * replace them with special device exclusive swap entries to grant a device - * exclusive access to the page. Caller must hold the page lock. + * Tries to remove all the page table entries which are mapping this + * folio and replace them with special device exclusive swap entries to + * grant a device exclusive access to the folio. * - * Returns false if the page is still mapped, or if it could not be unmapped + * Context: Caller must hold the folio lock. + * Return: false if the page is still mapped, or if it could not be unmapped * from the expected address. Otherwise returns true (success). */ -static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm, - unsigned long address, void *owner) +static bool folio_make_device_exclusive(struct folio *folio, + struct mm_struct *mm, unsigned long address, void *owner) { struct make_exclusive_args args = { .mm = mm, @@ -2153,21 +2104,20 @@ static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm, struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, + .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; /* - * Restrict to anonymous pages for now to avoid potential writeback - * issues. Also tail pages shouldn't be passed to rmap_walk so skip - * those. + * Restrict to anonymous folios for now to avoid potential writeback + * issues. */ - if (!PageAnon(page) || PageTail(page)) + if (!folio_test_anon(folio)) return false; - rmap_walk(page, &rwc); + rmap_walk(folio, &rwc); - return args.valid && !page_mapcount(page); + return args.valid && !folio_mapcount(folio); } /** @@ -2205,15 +2155,16 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, return npages; for (i = 0; i < npages; i++, start += PAGE_SIZE) { - if (!trylock_page(pages[i])) { - put_page(pages[i]); + struct folio *folio = page_folio(pages[i]); + if (PageTail(pages[i]) || !folio_trylock(folio)) { + folio_put(folio); pages[i] = NULL; continue; } - if (!page_make_device_exclusive(pages[i], mm, start, owner)) { - unlock_page(pages[i]); - put_page(pages[i]); + if (!folio_make_device_exclusive(folio, mm, start, owner)) { + folio_unlock(folio); + folio_put(folio); pages[i] = NULL; } } @@ -2232,21 +2183,21 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(root); } -static struct anon_vma *rmap_walk_anon_lock(struct page *page, - struct rmap_walk_control *rwc) +static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, + const struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) - return rwc->anon_lock(page); + return rwc->anon_lock(folio); /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; @@ -2262,35 +2213,30 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. 
- * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ -static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, - bool locked) +static void rmap_walk_anon(struct folio *folio, + const struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ - VM_BUG_ON_PAGE(!anon_vma, page); + VM_BUG_ON_FOLIO(!anon_vma, folio); } else { - anon_vma = rmap_walk_anon_lock(page, rwc); + anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; - pgoff_start = page_to_pgoff(page); - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); + unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2298,9 +2244,9 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; - if (rwc->done && rwc->done(page)) + if (rwc->done && rwc->done(folio)) break; } @@ -2315,16 +2261,11 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ -static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, - bool locked) +static void rmap_walk_file(struct folio *folio, + const struct rmap_walk_control *rwc, bool locked) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; @@ -2334,18 +2275,18 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_rwsem. 
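[Editor's note] rmap_walk_anon() above computes the folio's index window once (pgoff_start through pgoff_start + nr_pages - 1) and asks the anon_vma interval tree for every VMA overlapping it; rmap_walk_file() below does the same against i_mmap. A toy overlap check showing the bounds involved (invented types, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct toy_vma {
	unsigned long vm_pgoff;		/* first file page mapped by the vma */
	unsigned long nr_pages;		/* pages covered by the vma */
};

static bool vma_overlaps_folio(const struct toy_vma *vma,
			       unsigned long pgoff_start, unsigned long nr)
{
	unsigned long pgoff_end = pgoff_start + nr - 1;
	unsigned long vma_last = vma->vm_pgoff + vma->nr_pages - 1;

	return vma->vm_pgoff <= pgoff_end && vma_last >= pgoff_start;
}

int main(void)
{
	struct toy_vma vma = { .vm_pgoff = 16, .nr_pages = 32 };	/* pages 16..47 */

	printf("%d\n", vma_overlaps_folio(&vma, 0, 512));	/* 1: huge folio covers it */
	printf("%d\n", vma_overlaps_folio(&vma, 40, 4));	/* 1: partial overlap */
	printf("%d\n", vma_overlaps_folio(&vma, 48, 4));	/* 0: just past the vma */
	return 0;
}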
*/ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!mapping) return; - pgoff_start = page_to_pgoff(page); - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(page, vma); + unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2353,9 +2294,9 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; - if (rwc->done && rwc->done(page)) + if (rwc->done && rwc->done(folio)) goto done; } @@ -2364,25 +2305,25 @@ done: i_mmap_unlock_read(mapping); } -void rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) { - if (unlikely(PageKsm(page))) - rmap_walk_ksm(page, rwc); - else if (PageAnon(page)) - rmap_walk_anon(page, rwc, false); + if (unlikely(folio_test_ksm(folio))) + rmap_walk_ksm(folio, rwc); + else if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, false); else - rmap_walk_file(page, rwc, false); + rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) { /* no ksm support for now */ - VM_BUG_ON_PAGE(PageKsm(page), page); - if (PageAnon(page)) - rmap_walk_anon(page, rwc, true); + VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); + if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, true); else - rmap_walk_file(page, rwc, true); + rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE @@ -2410,8 +2351,7 @@ void hugepage_add_new_anon_rmap(struct page *page, { BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); __page_set_anon_rmap(page, vma, address, 1); } diff --git a/mm/secretmem.c b/mm/secretmem.c index 22b310adb53d..098638d3b8a4 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -152,7 +152,7 @@ static void secretmem_freepage(struct page *page) } const struct address_space_operations secretmem_aops = { - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, .freepage = secretmem_freepage, .migratepage = secretmem_migratepage, .isolate_page = secretmem_isolate_page, diff --git a/mm/shmem.c b/mm/shmem.c index 18f93c2d68f1..529c9ad3e926 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,7 +36,6 @@ #include <linux/uio.h> #include <linux/khugepaged.h> #include <linux/hugetlb.h> -#include <linux/frontswap.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> @@ -477,6 +476,8 @@ bool shmem_is_huge(struct vm_area_struct *vma, { loff_t i_size; + if (!S_ISREG(inode->i_mode)) + return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || @@ -554,7 +555,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -569,7 +570,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -577,12 +577,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -602,7 +602,7 @@ next: inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -616,38 +616,44 @@ next: } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } @@ -694,7 +700,6 @@ static int shmem_add_to_page_cache(struct page *page, struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; unsigned long nr = compound_nr(page); int error; @@ -721,20 +726,18 @@ static int shmem_add_to_page_cache(struct page *page, cgroup_throttle_swaprate(page, gfp); do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); @@ -880,30 +883,26 @@ void shmem_unlock_mapping(struct address_space *mapping) } } -/* - * Check whether a hole-punch or truncation needs to split a huge page, - * returning true if no split was required, or the split has been successful. - * - * Eviction (or truncation to 0 size) should never need to split a huge page; - * but in rare cases might do so, if shmem_undo_range() failed to trylock on - * head, and then succeeded to trylock on tail. 
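[Editor's note] The shrinker rework above also changes the bookkeeping: shrinklist_len is decremented as soon as an inode moves to a local scan list, and an inode whose huge page cannot be split is moved back onto the global shrinklist, under the lock, before iput(), so no other thread can observe it on a soon-to-be-freed local list. A rough userspace model of that grab/move-back pattern (a pthread mutex stands in for shrinklist_lock; all names invented):

#include <pthread.h>
#include <stdio.h>

struct toy_node { struct toy_node *next; int id; };

static pthread_mutex_t shrinklist_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_node *shrinklist;
static unsigned long shrinklist_len;

/* take one victim off the global list (the count drops immediately) */
static struct toy_node *grab_victim(void)
{
	struct toy_node *node;

	pthread_mutex_lock(&shrinklist_lock);
	node = shrinklist;
	if (node) {
		shrinklist = node->next;
		shrinklist_len--;
	}
	pthread_mutex_unlock(&shrinklist_lock);
	return node;
}

/* split failed or the page was busy: put the victim back before dropping it */
static void move_back(struct toy_node *node)
{
	pthread_mutex_lock(&shrinklist_lock);
	node->next = shrinklist;
	shrinklist = node;
	shrinklist_len++;
	pthread_mutex_unlock(&shrinklist_lock);
}

int main(void)
{
	struct toy_node a = { .id = 1 };

	shrinklist = &a;
	shrinklist_len = 1;

	struct toy_node *victim = grab_victim();
	printf("len after grab: %lu\n", shrinklist_len);	/* 0 */
	move_back(victim);
	printf("len after move_back: %lu\n", shrinklist_len);	/* 1 */
	return 0;
}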
- * - * A split can only succeed when there are no additional references on the - * huge page: so the split below relies upon find_get_entries() having stopped - * when it found a subpage of the huge page, without getting further references. - */ -static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { - if (!PageTransCompound(page)) - return true; - - /* Just proceed to delete a huge page wholly within the range punched */ - if (PageHead(page) && - page->index >= start && page->index + HPAGE_PMD_NR <= end) - return true; + struct folio *folio; + struct page *page; - /* Try to split huge page, so we can truly punch the hole or truncate */ - return split_huge_page(page) >= 0; + /* + * At first avoid shmem_getpage(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + page = NULL; + shmem_getpage(inode, index, &page, SGP_READ); + return page ? page_folio(page) : NULL; } /* @@ -917,10 +916,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@ -931,67 +930,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1; - if (!unfalloc || !PageUptodate(page)) - truncate_inode_page(mapping, page); - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } - if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if 
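[Editor's note] With folio batches, the truncation loop above advances by whole folios: after handling an entry it does index += folio_nr_pages(folio) - 1 (plus the loop's own index++), so a 2MB folio consumes 512 indices in one step. A minimal illustration of that stride (toy data, not kernel code):

#include <stdio.h>

struct toy_folio { unsigned long index; unsigned long nr_pages; };

int main(void)
{
	/* a single-page folio at index 3, then a 512-page (2MB) folio at 512 */
	struct toy_folio batch[] = { { 3, 1 }, { 512, 512 } };
	unsigned long index = 0;

	for (unsigned long i = 0; i < 2; i++) {
		index = batch[i].index;
		printf("truncating %lu page(s) at index %lu\n",
		       batch[i].nr_pages, index);
		index += batch[i].nr_pages - 1;	/* skip the folio's tail pages */
		index++;			/* next iteration starts past the folio */
	}
	printf("next scan starts at index %lu\n", index);	/* 1024 */
	return 0;
}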
(!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return; index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -1000,14 +996,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@ -1016,32 +1012,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } - lock_page(page); + folio_lock(folio); - if (!unfalloc || !PageUptodate(page)) { - if (page_mapping(page) != mapping) { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Wipe the page and don't get stuck */ - clear_highpage(page); - flush_dcache_page(page); - set_page_dirty(page); - if (index < - round_up(start, HPAGE_PMD_NR)) - start = index + 1; - } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; } @@ -1075,6 +1063,12 @@ static int shmem_getattr(struct user_namespace *mnt_userns, if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = info->i_crtime.tv_sec; + stat->btime.tv_nsec = info->i_crtime.tv_nsec; + } + return 0; } @@ -1135,6 +1129,7 @@ static void shmem_evict_inode(struct inode *inode) if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; + mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); @@ -1165,7 +1160,7 @@ static void shmem_evict_inode(struct inode *inode) static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, 
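[Editor's note] The shmem_getattr() hunk above starts reporting the new i_crtime through STATX_BTIME, which surfaces to userspace via statx(). A small program to query it (assumes a glibc that provides the statx() wrapper and a kernel carrying this change; the path is only an example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;
	const char *path = argc > 1 ? argv[1] : "/dev/shm/example";

	if (statx(AT_FDCWD, path, 0, STATX_BTIME, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld.%09u\n",
		       (long long)stx.stx_btime.tv_sec, stx.stx_btime.tv_nsec);
	else
		printf("birth time not reported for %s\n", path);
	return 0;
}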
- unsigned int type, bool frontswap) + unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; @@ -1186,9 +1181,6 @@ static int shmem_find_swap_entries(struct address_space *mapping, entry = radix_to_swp_entry(page); if (swp_type(entry) != type) continue; - if (frontswap && - !frontswap_test(swap_info[type], swp_offset(entry))) - continue; indices[ret] = xas.xa_index; entries[ret] = page; @@ -1241,26 +1233,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct inode *inode, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; - bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); int ret = 0; pagevec_init(&pvec); do { unsigned int nr_entries = PAGEVEC_SIZE; - if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) - nr_entries = *fs_pages_to_unuse; - pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, - pvec.pages, indices, - type, frontswap); + pvec.pages, indices, type); if (pvec.nr == 0) { ret = 0; break; @@ -1270,14 +1256,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, if (ret < 0) break; - if (frontswap_partial) { - *fs_pages_to_unuse -= ret; - if (*fs_pages_to_unuse == 0) { - ret = FRONTSWAP_PAGES_UNUSED; - break; - } - } - start = indices[pvec.nr - 1]; } while (true); @@ -1289,8 +1267,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, * device 'type' back into memory, so the swap device can be * unused. 
*/ -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; @@ -1313,8 +1290,7 @@ int shmem_unuse(unsigned int type, bool frontswap, atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, - fs_pages_to_unuse); + error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); mutex_lock(&shmem_swaplist_mutex); @@ -1559,8 +1535,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1888,9 +1863,6 @@ repeat: return 0; } - /* Never use a huge page for shmem_symlink() */ - if (S_ISLNK(inode->i_mode)) - goto alloc_nohuge; if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; @@ -2299,6 +2271,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -2391,8 +2364,10 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ goto out_unacct_blocks; } + + flush_dcache_page(page); } else { /* ZEROPAGE */ - clear_highpage(page); + clear_user_highpage(page, dst_addr); } } else { page = *pagep; @@ -2457,6 +2432,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2467,7 +2443,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2513,19 +2501,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; - enum sgp_type sgp = SGP_READ; int error = 0; ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; - /* - * Might this read be for a stacking filesystem? Then when reading - * holes of a sparse file, we actually need to allocate those pages, - * and even mark them dirty, so it cannot exceed the max_blocks limit. 
- */ - if (!iter_is_iovec(to)) - sgp = SGP_CACHE; - index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -2534,6 +2513,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) pgoff_t end_index; unsigned long nr, ret; loff_t i_size = i_size_read(inode); + bool got_page; end_index = i_size >> PAGE_SHIFT; if (index > end_index) @@ -2544,16 +2524,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp); + error = shmem_getpage(inode, index, &page, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (page) { - if (sgp == SGP_CACHE) - set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -2586,9 +2570,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) */ if (!offset) mark_page_accessed(page); + got_page = true; } else { page = ZERO_PAGE(0); - get_page(page); + got_page = false; } /* @@ -2601,7 +2586,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; - put_page(page); + if (got_page) + put_page(page); if (!iov_iter_count(to)) break; if (ret < nr) { @@ -3093,7 +3079,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3101,6 +3088,13 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3203,6 +3197,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { + .getattr = shmem_getattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3210,6 +3205,7 @@ static const struct inode_operations shmem_short_symlink_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { + .getattr = shmem_getattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3714,7 +3710,7 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; - info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; @@ -3751,9 +3747,16 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, @@ -3761,7 +3764,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = 
generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -3790,6 +3793,7 @@ static const struct inode_operations shmem_inode_operations = { static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS + .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, @@ -3811,6 +3815,7 @@ static const struct inode_operations shmem_dir_inode_operations = { }; static const struct inode_operations shmem_special_inode_operations = { + .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif @@ -3962,8 +3967,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, return count; } -struct kobj_attribute shmem_enabled_attr = - __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); +struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ @@ -3995,8 +3999,7 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { return 0; } @@ -4169,9 +4172,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --git a/mm/slab.c b/mm/slab.c index ddf5737c63d9..d9dec7a8fd79 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3211,7 +3211,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ bool init = false; flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); + cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); if (unlikely(!cachep)) return NULL; @@ -3287,7 +3287,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller) +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3295,7 +3296,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo bool init = false; flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); + cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags); if (unlikely(!cachep)) return NULL; @@ -3484,6 +3485,18 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, __free_one(ac, objp); } +static __always_inline +void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + gfp_t flags) +{ + void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, + cachep->object_size, cachep->size, flags); + + return ret; +} + /** * kmem_cache_alloc - Allocate an object * @cachep: The cache to allocate from. 
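[Editor's note] kmem_cache_alloc_lru(), added above for SLAB (and for SLOB/SLUB in the later hunks), takes the list_lru the object is destined for so its memcg-aware lru structures can be provisioned at allocation time; passing a NULL lru behaves like plain kmem_cache_alloc(). The alloc_inode_sb() call seen in the shmem hunk earlier is the intended consumer, roughly along these lines (kernel-style sketch, not standalone-compilable; the s_inode_lru field name is an assumption from context):

static inline struct inode *alloc_inode_sb(struct super_block *sb,
					   struct kmem_cache *cache, gfp_t gfp)
{
	/* charge the inode and pre-provision the sb's inode LRU for this memcg */
	return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
}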
@@ -3496,15 +3509,17 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_); - - trace_kmem_cache_alloc(_RET_IP_, ret, - cachep->object_size, cachep->size, flags); - - return ret; + return __kmem_cache_alloc_lru(cachep, NULL, flags); } EXPORT_SYMBOL(kmem_cache_alloc); +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + gfp_t flags) +{ + return __kmem_cache_alloc_lru(cachep, lru, flags); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); + static __always_inline void cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p, unsigned long caller) @@ -3521,7 +3536,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, size_t i; struct obj_cgroup *objcg = NULL; - s = slab_pre_alloc_hook(s, &objcg, size, flags); + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); if (!s) return 0; @@ -3562,7 +3577,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = slab_alloc(cachep, flags, size, _RET_IP_); + ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, @@ -3689,7 +3704,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = slab_alloc(cachep, flags, size, caller); + ret = slab_alloc(cachep, NULL, flags, size, caller); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, diff --git a/mm/slab.h b/mm/slab.h index 95b9a74a2d51..fd7ae2024897 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -67,14 +67,8 @@ struct slab { static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ -SLAB_MATCH(slab_list, slab_list); #ifndef CONFIG_SLOB SLAB_MATCH(rcu_head, rcu_head); -SLAB_MATCH(slab_cache, slab_cache); -#endif -#ifdef CONFIG_SLAB -SLAB_MATCH(s_mem, s_mem); -SLAB_MATCH(active, active); #endif SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -237,6 +231,7 @@ struct kmem_cache { #include <linux/kmemleak.h> #include <linux/random.h> #include <linux/sched/mm.h> +#include <linux/list_lru.h> /* * State of the slab allocator. @@ -478,6 +473,7 @@ static inline size_t obj_full_size(struct kmem_cache *s) * Returns false if the allocation should fail. 
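[Editor's note] The trimmed SLAB_MATCH() assertions above exist because struct slab is overlaid on struct page: every field the two types still share must sit at the same offset, and the check happens at compile time. A self-contained illustration of the same trick with toy structs:

#include <stddef.h>
#include <stdio.h>

struct toy_page {
	unsigned long flags;
	void *compound_head;
	int refcount;
};

struct toy_slab {
	unsigned long __page_flags;	/* overlays toy_page.flags */
	void *slab_list;		/* overlays toy_page.compound_head */
	int __page_refcount;		/* overlays toy_page.refcount */
};

#define SLAB_MATCH(pg, sl) \
	_Static_assert(offsetof(struct toy_page, pg) == \
		       offsetof(struct toy_slab, sl), \
		       "offset mismatch: " #pg " vs " #sl)

SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_list);
SLAB_MATCH(refcount, __page_refcount);

int main(void)
{
	printf("struct toy_slab aliases struct toy_page cleanly\n");
	return 0;
}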
*/ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, struct obj_cgroup **objcgp, size_t objects, gfp_t flags) { @@ -493,13 +489,26 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, if (!objcg) return true; - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) { - obj_cgroup_put(objcg); - return false; + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + goto out; } + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) + goto out; + *objcgp = objcg; return true; +out: + obj_cgroup_put(objcg); + return false; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, @@ -604,6 +613,7 @@ static inline void memcg_free_slab_cgroups(struct slab *slab) } static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, struct obj_cgroup **objcgp, size_t objects, gfp_t flags) { @@ -703,6 +713,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, struct obj_cgroup **objcgp, size_t size, gfp_t flags) { @@ -713,7 +724,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (should_failslab(s, flags)) return NULL; - if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags)) + if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) return NULL; return s; @@ -794,11 +805,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif -void *slab_start(struct seq_file *m, loff_t *pos); -void *slab_next(struct seq_file *m, void *p, loff_t *pos); -void slab_stop(struct seq_file *m, void *p); -int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index dc15566141d4..6ee64d6208b3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,9 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err; - - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); @@ -501,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); @@ -812,7 +807,7 @@ void __init setup_kmalloc_cache_index_table(void) unsigned int i; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + !is_power_of_2(KMALLOC_MIN_SIZE)); for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { unsigned int elem = size_index_elem(i); @@ -824,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void) if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. 
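[Editor's note] The BUILD_BUG_ON change above swaps the open-coded (x & (x - 1)) test for is_power_of_2(), which is the same check for any non-zero value, just spelled by name. A quick demonstration:

#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned long vals[] = { 8, 96, 128, 192, 256 };

	for (int i = 0; i < 5; i++)
		printf("%3lu -> %s\n", vals[i],
		       is_power_of_2(vals[i]) ? "power of two" : "not");
	return 0;
}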
*/ for (i = 64 + 8; i <= 96; i += 8) @@ -849,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } @@ -1044,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -void *slab_start(struct seq_file *m, loff_t *pos) +static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); } -void *slab_next(struct seq_file *m, void *p, loff_t *pos) +static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); } -void slab_stop(struct seq_file *m, void *p) +static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@ -1123,17 +1118,6 @@ void dump_unreclaimable_slab(void) mutex_unlock(&slab_mutex); } -#if defined(CONFIG_MEMCG_KMEM) -int memcg_slab_show(struct seq_file *m, void *p) -{ - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; -} -#endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * diff --git a/mm/slob.c b/mm/slob.c index 60c5842215f1..dfa6808dff36 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -635,6 +635,12 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); + +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags) +{ + return slob_alloc_node(cachep, flags, NUMA_NO_NODE); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t gfp, int node) { @@ -708,7 +714,7 @@ int __kmem_cache_shrink(struct kmem_cache *d) return 0; } -struct kmem_cache kmem_cache_boot = { +static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", .size = sizeof(struct kmem_cache), .flags = SLAB_PANIC, diff --git a/mm/slub.c b/mm/slub.c index 261474092e43..74d92aa4a3a2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1788,8 +1788,8 @@ static void *setup_object(struct kmem_cache *s, struct slab *slab, /* * Slab allocation and freeing */ -static inline struct slab *alloc_slab_page(struct kmem_cache *s, - gfp_t flags, int node, struct kmem_cache_order_objects oo) +static inline struct slab *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) { struct folio *folio; struct slab *slab; @@ -1941,7 +1941,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); - slab = alloc_slab_page(s, alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -1949,7 +1949,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. 
* Try a lower order alloc if possible */ - slab = alloc_slab_page(s, alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) goto out; stat(s, ORDER_FALLBACK); @@ -2348,10 +2348,10 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int lock = 0, free_delta = 0; - enum slab_modes l = M_NONE, m = M_NONE; + int free_delta = 0; + enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; @@ -2393,14 +2393,10 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, * Ensure that the slab is unfrozen while the list presence * reflects the actual number of objects during unfreeze. * - * We setup the list membership and then perform a cmpxchg - * with the count. If there is a mismatch then the slab - * is not unfrozen but the slab is on the wrong list. - * - * Then we restart the process which may have to remove - * the slab from the list that we just put it on again - * because the number of objects in the slab may have - * changed. + * We first perform cmpxchg holding lock and insert to list + * when it succeed. If there is mismatch then the slab is not + * unfrozen and number of objects in the slab may have changed. + * Then release lock and retry cmpxchg again. */ redo: @@ -2419,61 +2415,52 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial >= s->min_partial) - m = M_FREE; - else if (new.freelist) { - m = M_PARTIAL; - if (!lock) { - lock = 1; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ - spin_lock_irqsave(&n->list_lock, flags); - } + if (!new.inuse && n->nr_partial >= s->min_partial) { + mode = M_FREE; + } else if (new.freelist) { + mode = M_PARTIAL; + /* + * Taking the spinlock removes the possibility that + * acquire_slab() will see a slab that is frozen + */ + spin_lock_irqsave(&n->list_lock, flags); + } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { + mode = M_FULL; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); } else { - m = M_FULL; - if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) { - lock = 1; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. 
- */ - spin_lock_irqsave(&n->list_lock, flags); - } + mode = M_FULL_NOLIST; } - if (l != m) { - if (l == M_PARTIAL) - remove_partial(n, slab); - else if (l == M_FULL) - remove_full(s, n, slab); - - if (m == M_PARTIAL) - add_partial(n, slab, tail); - else if (m == M_FULL) - add_full(s, n, slab); - } - l = m; if (!cmpxchg_double_slab(s, slab, old.freelist, old.counters, new.freelist, new.counters, - "unfreezing slab")) + "unfreezing slab")) { + if (mode == M_PARTIAL || mode == M_FULL) + spin_unlock_irqrestore(&n->list_lock, flags); goto redo; + } - if (lock) - spin_unlock_irqrestore(&n->list_lock, flags); - if (m == M_PARTIAL) + if (mode == M_PARTIAL) { + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, tail); - else if (m == M_FULL) - stat(s, DEACTIVATE_FULL); - else if (m == M_FREE) { + } else if (mode == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); + } else if (mode == M_FULL) { + add_full(s, n, slab); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, DEACTIVATE_FULL); + } else if (mode == M_FULL_NOLIST) { + stat(s, DEACTIVATE_FULL); } } @@ -3131,7 +3118,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, +static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; @@ -3141,7 +3128,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct obj_cgroup *objcg = NULL; bool init = false; - s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); if (!s) return NULL; @@ -3232,27 +3219,41 @@ out: return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); + return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +static __always_inline +void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); + void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); return ret; } + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + return __kmem_cache_alloc_lru(s, NULL, gfpflags); +} EXPORT_SYMBOL(kmem_cache_alloc); +void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) +{ + return __kmem_cache_alloc_lru(s, lru, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); + #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); + void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -3263,7 +3264,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); + void *ret = 
slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -3277,7 +3278,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -3667,7 +3668,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, struct obj_cgroup *objcg = NULL; /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, &objcg, size, flags); + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); if (unlikely(!s)) return false; /* @@ -4000,15 +4001,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 1; } -static void set_min_partial(struct kmem_cache *s, unsigned long min) -{ - if (min < MIN_PARTIAL) - min = MIN_PARTIAL; - else if (min > MAX_PARTIAL) - min = MAX_PARTIAL; - s->min_partial = min; -} - static void set_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -4046,7 +4038,7 @@ static void set_cpu_partial(struct kmem_cache *s) * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s, int forced_order) +static int calculate_sizes(struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -4150,10 +4142,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size = ALIGN(size, s->align); s->size = size; s->reciprocal_size = reciprocal_value(size); - if (forced_order >= 0) - order = forced_order; - else - order = calculate_order(size); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -4189,7 +4178,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) s->random = get_random_long(); #endif - if (!calculate_sizes(s, -1)) + if (!calculate_sizes(s)) goto error; if (disable_higher_order_debug) { /* @@ -4199,7 +4188,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; - if (!calculate_sizes(s, -1)) + if (!calculate_sizes(s)) goto error; } } @@ -4215,7 +4204,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) * The larger the object size is, the more slabs we want on the partial * list to avoid pounding the page allocator excessively. 
*/ - set_min_partial(s, ilog2(s->size) / 2); + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); set_cpu_partial(s); @@ -4417,7 +4407,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, _RET_IP_, size); + ret = slab_alloc(s, NULL, flags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -4465,7 +4455,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, flags, node, _RET_IP_, size); + ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -4923,7 +4913,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, caller, size); + ret = slab_alloc(s, NULL, gfpflags, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4954,7 +4944,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, gfpflags, node, caller, size); + ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); @@ -5344,12 +5334,10 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0400, _name##_show, NULL) + static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0600, _name##_show, _name##_store) + static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -5396,7 +5384,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - set_min_partial(s, min); + s->min_partial = min; return length; } SLAB_ATTR(min_partial); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index db6df27c852a..8aecd6b3896c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -53,8 +54,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +76,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in pmd_install(). 
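As a side note on the mm/slub.c hunk above that open-codes set_min_partial(): with the existing limits MIN_PARTIAL = 5 and MAX_PARTIAL = 10, a 4096-byte object gives ilog2(4096) / 2 = 6, so min_partial ends up at 6, while a 16-byte object gives ilog2(16) / 2 = 2 and is raised to the floor of 5. The min_t()/max_t() pair is simply a clamp:

/* Illustrative equivalent only, using clamp_t() from <linux/minmax.h>;
 * the patch itself keeps the explicit min_t()/max_t() pair. */
s->min_partial = clamp_t(unsigned long, ilog2(s->size) / 2,
			 MIN_PARTIAL, MAX_PARTIAL);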
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +140,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -245,6 +263,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +296,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } @@ -300,10 +339,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* @@ -383,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/mm/sparse.c b/mm/sparse.c index d21c6e5910d0..952f06d8f373 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -126,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section) } /* Validate the physical addressing limitations of the model */ -void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, +static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); diff --git a/mm/swap.c b/mm/swap.c index e8c9dc6d0377..5b30045207e1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM 
activity - pages are normally - * freed via pagevecs. But it gets used by networking. + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct page *page) { @@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page) __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } + /* See comment on PageMlocked in release_pages() */ + if (unlikely(PageMlocked(page))) { + int nr_pages = thp_nr_pages(page); + + __ClearPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } __ClearPageWaiters(page); } @@ -114,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. - */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -425,7 +425,7 @@ void folio_mark_accessed(struct folio *folio) /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an - * evictable page accessed has no effect. + * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* @@ -482,22 +482,12 @@ EXPORT_SYMBOL(folio_add_lru); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { - bool unevictable; - VM_BUG_ON_PAGE(PageLRU(page), page); - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - /* - * We use the irq-unsafe __mod_zone_page_state because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - } - lru_cache_add(page); + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(page); + else + lru_cache_add(page); } /* @@ -636,35 +626,37 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + mlock_page_drain(cpu); } /** - * deactivate_file_page - forcefully deactivate a file page - * @page: page to deactivate + * deactivate_file_folio() - Forcefully deactivate a file folio. + * @folio: Folio to deactivate. * - * This function hints the VM that @page is a good reclaim candidate, - * for example if its invalidation fails due to the page being dirty + * This function hints to the VM that @folio is a good reclaim candidate, + * for example if its invalidation fails due to the folio being dirty * or under writeback. + * + * Context: Caller holds a reference on the page. */ -void deactivate_file_page(struct page *page) +void deactivate_file_folio(struct folio *folio) { + struct pagevec *pvec; + /* - * In a workload with many unevictable page such as mprotect, - * unevictable page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable pages such as mprotect, + * unevictable folio deactivation for accelerating reclaim is pointless. 
*/ - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return; - if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); + folio_get(folio); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); - local_unlock(&lru_pvecs.lock); - } + if (pagevec_add_and_need_flush(pvec, &folio->page)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + local_unlock(&lru_pvecs.lock); } /* @@ -831,13 +823,13 @@ inline void __lru_add_drain_all(bool force_all_cpus) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (force_all_cpus || - pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || + need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -876,15 +868,21 @@ atomic_t lru_disable_count = ATOMIC_INIT(0); void lru_cache_disable(void) { atomic_inc(&lru_disable_count); -#ifdef CONFIG_SMP /* - * lru_add_drain_all in the force mode will schedule draining on - * all online CPUs so any calls of lru_cache_disabled wrapped by - * local_lock or preemption disabled would be ordered by that. - * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling - * guarantees. + * Readers of lru_disable_count are protected by either disabling + * preemption or rcu_read_lock: + * + * preempt_disable, local_irq_disable [bh_lru_lock()] + * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] + * preempt_disable [local_lock !CONFIG_PREEMPT_RT] + * + * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on + * preempt_disable() regions of code. So any CPU which sees + * lru_disable_count = 0 will have exited the critical + * section when synchronize_rcu() returns. */ + synchronize_rcu(); +#ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); @@ -930,18 +928,10 @@ void release_pages(struct page **pages, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - /* - * ZONE_DEVICE pages that return 'false' from - * page_is_devmap_managed() do not require special - * processing, and instead, expect a call to - * put_page_testzero(). - */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (put_devmap_managed_page(page)) continue; - } if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } @@ -969,6 +959,18 @@ void release_pages(struct page **pages, int nr) __clear_page_lru_flags(page); } + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. 
+ */ + if (unlikely(PageMlocked(page))) { + __ClearPageMlocked(page); + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); @@ -1009,43 +1011,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + folio_set_lru(folio); /* - * A folio becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the folio on the correct LRU - * and then - * a) do PageLRU check with lock [check_move_unevictable_pages] - * b) do PageLRU check before lock [clear_page_mlock] - * - * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need - * following strict ordering: - * - * #0: __pagevec_lru_add_fn #1: clear_page_mlock - * - * folio_set_lru() folio_test_clear_mlocked() - * smp_mb() // explicit ordering // above provides strict - * // ordering - * folio_test_mlocked() folio_test_lru() - * + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears PageMlocked while the LRU + * lock is held. * - * if '#1' does not observe setting of PG_lru by '#0' and - * fails isolation, the explicit barrier will make sure that - * folio_evictable check will put the folio on the correct - * LRU. Without smp_mb(), folio_set_lru() can be reordered - * after folio_test_mlocked() check and can make '#1' fail the - * isolation of the folio whose mlocked bit is cleared (#0 is - * also looking at the same folio) and the evictable folio will - * be stranded on an unevictable LRU. + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) */ - folio_set_lru(folio); - smp_mb__after_atomic(); - if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } @@ -1077,24 +1068,24 @@ void __pagevec_lru_add(struct pagevec *pvec) } /** - * pagevec_remove_exceptionals - pagevec exceptionals pruning - * @pvec: The pagevec to prune + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune * - * find_get_entries() fills both pages and XArray value entries (aka - * exceptional entries) into the pagevec. This function prunes all - * exceptionals from @pvec without leaving holes, so that it can be - * passed on to page-only pagevec operations. + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations. 
*/ -void pagevec_remove_exceptionals(struct pagevec *pvec) +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { - int i, j; + unsigned int i, j; - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!xa_is_value(page)) - pvec->pages[j++] = page; + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; } - pvec->nr = j; + fbatch->nr = j; } /** @@ -1153,26 +1144,3 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void put_devmap_managed_page(struct page *page) -{ - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(put_devmap_managed_page); -#endif diff --git a/mm/swap_state.c b/mm/swap_state.c index 8d4104242100..013856004825 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -30,7 +30,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = swap_set_page_dirty, + .dirty_folio = swap_dirty_folio, #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif @@ -478,7 +478,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its page to swap cache. */ - cond_resched(); + schedule_timeout_uninterruptible(1); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e1..33c7abb16610 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -DEFINE_SPINLOCK(swap_lock); +static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* @@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; * all active swap_info_structs * protected with swap_lock, and ordered by priority. 
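The folio_batch form of this helper pairs with find_get_entries(); the intended calling pattern looks roughly like the sketch below (a hypothetical walker, assuming access to the mm-internal find_get_entries() declaration in mm/internal.h; per-folio locking and error handling omitted):

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include "internal.h"

/* Hypothetical helper: count pages cached in [start, end] of a mapping. */
static unsigned long count_cached_pages(struct address_space *mapping,
					pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio_batch fbatch;
	unsigned long nr = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (find_get_entries(mapping, start, end, &fbatch, indices)) {
		/* Advance past the last entry returned in this batch. */
		start = indices[folio_batch_count(&fbatch) - 1] + 1;
		/* Drop shadow/swap/DAX value entries, keep real folios. */
		folio_batch_remove_exceptionals(&fbatch);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			nr += folio_nr_pages(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return nr;
}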
*/ -PLIST_HEAD(swap_active_head); +static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). 
*/ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ @@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); @@ -1937,8 +1923,7 @@ out: static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { struct page *page; swp_entry_t entry; @@ -1959,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; offset = swp_offset(entry); - if (frontswap && !frontswap_test(si, offset)) - continue; - pte_unmap(pte); swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); @@ -1969,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct vm_fault vmf = { .vma = vma, .address = addr, + .real_address = addr, .pmd = pmd, }; @@ -1993,11 +1976,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, try_to_free_swap(page); unlock_page(page); put_page(page); - - if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { - ret = FRONTSWAP_PAGES_UNUSED; - goto out; - } try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -2010,8 +1988,7 @@ out: static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pmd_t *pmd; unsigned long next; @@ -2023,8 +2000,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -2033,8 +2009,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pud_t *pud; unsigned long next; @@ -2045,8 +2020,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, type, - 
frontswap, fs_pages_to_unuse); + ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -2055,8 +2029,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { p4d_t *p4d; unsigned long next; @@ -2067,16 +2040,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; @@ -2090,16 +2061,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_p4d_range(vma, pgd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; @@ -2107,8 +2076,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) { - ret = unuse_vma(vma, type, frontswap, - fs_pages_to_unuse); + ret = unuse_vma(vma, type); if (ret) break; } @@ -2124,7 +2092,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, * if there are no inuse entries after prev till end of the map. 
*/ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev, bool frontswap) + unsigned int prev) { unsigned int i; unsigned char count; @@ -2138,8 +2106,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - if (!frontswap || frontswap_test(si, i)) - break; + break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } @@ -2150,12 +2117,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, return i; } -/* - * If the boolean frontswap is true, only unuse pages_to_unuse pages; - * pages_to_unuse==0 means all pages; ignored if frontswap is false - */ -int try_to_unuse(unsigned int type, bool frontswap, - unsigned long pages_to_unuse) +static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; @@ -2169,13 +2131,10 @@ int try_to_unuse(unsigned int type, bool frontswap, if (!READ_ONCE(si->inuse_pages)) return 0; - if (!frontswap) - pages_to_unuse = 0; - retry: - retval = shmem_unuse(type, frontswap, &pages_to_unuse); + retval = shmem_unuse(type); if (retval) - goto out; + return retval; prev_mm = &init_mm; mmget(prev_mm); @@ -2192,11 +2151,10 @@ retry: spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; - retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - + retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); - goto out; + return retval; } /* @@ -2213,7 +2171,7 @@ retry: i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && - (i = find_next_to_unuse(si, i, frontswap)) != 0) { + (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); page = find_get_page(swap_address_space(entry), i); @@ -2231,14 +2189,6 @@ retry: try_to_free_swap(page); unlock_page(page); put_page(page); - - /* - * For frontswap, we just need to unuse pages_to_unuse, if - * it was specified. Need not check frontswap again here as - * we already zeroed out pages_to_unuse if not frontswap. - */ - if (pages_to_unuse && --pages_to_unuse == 0) - goto out; } /* @@ -2256,10 +2206,10 @@ retry: if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; - retval = -EINTR; + return -EINTR; } -out: - return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; + + return 0; } /* @@ -2477,7 +2427,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { - frontswap_init(p->type, frontswap_map); + if (IS_ENABLED(CONFIG_FRONTSWAP)) + frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -2590,7 +2541,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) disable_swap_slots_cache_lock(); set_current_oom_origin(); - err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { diff --git a/mm/truncate.c b/mm/truncate.c index cc83a3f7c1ad..ab50d0d59a2a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,10 +19,8 @@ #include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> -#include <linux/buffer_head.h> /* grr. try_to_release_page, - do_invalidatepage */ +#include <linux/buffer_head.h> /* grr. 
try_to_release_page */ #include <linux/shmem_fs.h> -#include <linux/cleancache.h> #include <linux/rmap.h> #include "internal.h" @@ -56,11 +54,11 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, /* * Unconditionally remove exceptional entries. Usually called from truncate - * path. Note that the pagevec may be altered by this function by removing - * exceptional entries similar to what pagevec_remove_exceptionals does. + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. */ -static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices) +static void truncate_folio_batch_exceptionals(struct address_space *mapping, + struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; @@ -69,11 +67,11 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, if (shmem_mapping(mapping)) return; - for (j = 0; j < pagevec_count(pvec); j++) - if (xa_is_value(pvec->pages[j])) + for (j = 0; j < folio_batch_count(fbatch); j++) + if (xa_is_value(fbatch->folios[j])) break; - if (j == pagevec_count(pvec)) + if (j == folio_batch_count(fbatch)) return; dax = dax_mapping(mapping); @@ -82,12 +80,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, xa_lock_irq(&mapping->i_pages); } - for (i = j; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + for (i = j; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i]; - if (!xa_is_value(page)) { - pvec->pages[j++] = page; + if (!xa_is_value(folio)) { + fbatch->folios[j++] = folio; continue; } @@ -96,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - __clear_shadow_entry(mapping, index, page); + __clear_shadow_entry(mapping, index, folio); } if (!dax) { @@ -105,7 +103,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } - pvec->nr = j; + fbatch->nr = j; } /* @@ -139,33 +137,28 @@ static int invalidate_exceptional_entry2(struct address_space *mapping, } /** - * do_invalidatepage - invalidate part or all of a page - * @page: the page which is affected + * folio_invalidate - Invalidate part or all of a folio. + * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * - * do_invalidatepage() is called when all or part of the page has become + * folio_invalidate() is called when all or part of the folio has become * invalidated by a truncate operation. * - * do_invalidatepage() does not have to release all buffers, but it must + * folio_invalidate() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. 
*/ -void do_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +void folio_invalidate(struct folio *folio, size_t offset, size_t length) { - void (*invalidatepage)(struct page *, unsigned int, unsigned int); - - invalidatepage = page->mapping->a_ops->invalidatepage; -#ifdef CONFIG_BLOCK - if (!invalidatepage) - invalidatepage = block_invalidatepage; -#endif - if (invalidatepage) - (*invalidatepage)(page, offset, length); + const struct address_space_operations *aops = folio->mapping->a_ops; + + if (aops->invalidate_folio) + aops->invalidate_folio(folio, offset, length); } +EXPORT_SYMBOL_GPL(folio_invalidate); /* * If truncate cannot remove the fs-private metadata from the page, the page @@ -177,57 +170,82 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void truncate_cleanup_page(struct page *page) +static void truncate_cleanup_folio(struct folio *folio) { - if (page_mapped(page)) - unmap_mapping_page(page); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); - if (page_has_private(page)) - do_invalidatepage(page, 0, thp_size(page)); + if (folio_has_private(folio)) + folio_invalidate(folio, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - cancel_dirty_page(page); - ClearPageMappedToDisk(page); + folio_cancel_dirty(folio); + folio_clear_mappedtodisk(folio); } -/* - * This is for invalidate_mapping_pages(). That function can be called at - * any time, and is not supposed to throw away dirty pages. But pages can - * be marked dirty at any time too, so use remove_mapping which safely - * discards clean, unused pages. - * - * Returns non-zero if the page was successfully invalidated. - */ -static int -invalidate_complete_page(struct address_space *mapping, struct page *page) +int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { - int ret; - - if (page->mapping != mapping) - return 0; - - if (page_has_private(page) && !try_to_release_page(page, 0)) - return 0; - - ret = remove_mapping(mapping, page); + if (folio->mapping != mapping) + return -EIO; - return ret; + truncate_cleanup_folio(folio); + filemap_remove_folio(folio); + return 0; } -int truncate_inode_page(struct address_space *mapping, struct page *page) +/* + * Handle partial folios. The folio may be entirely within the + * range if a split has raced with us. If not, we zero the part of the + * folio that's within the [start, end] range, and then split the folio if + * it's large. split_page_range() will discard pages which now lie beyond + * i_size, and we rely on the caller to discard pages which lie within a + * newly created hole. + * + * Returns false if splitting failed so the caller can avoid + * discarding the entire folio which is stubbornly unsplit. 
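With the block_invalidatepage fallback gone from this path, a filesystem that needs per-folio teardown opts in through its address_space_operations. A hypothetical sketch (the myfs_* names are illustrative, not from the patch):

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Called by folio_invalidate() when part or all of the folio is
 * truncated or invalidated; tear down fs-private state in the range. */
static void myfs_invalidate_folio(struct folio *folio, size_t offset,
				  size_t length)
{
	/* e.g. drop buffers or extent state covering [offset, offset + length) */
}

static const struct address_space_operations myfs_aops = {
	.invalidate_folio	= myfs_invalidate_folio,
	/* .dirty_folio, .launder_folio, ... as converted elsewhere */
};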
+ */ +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { - VM_BUG_ON_PAGE(PageTail(page), page); + loff_t pos = folio_pos(folio); + unsigned int offset, length; - if (page->mapping != mapping) - return -EIO; + if (pos < start) + offset = start - pos; + else + offset = 0; + length = folio_size(folio); + if (pos + length <= (u64)end) + length = length - offset; + else + length = end + 1 - pos - offset; - truncate_cleanup_page(page); - delete_from_page_cache(page); - return 0; + folio_wait_writeback(folio); + if (length == folio_size(folio)) { + truncate_inode_folio(folio->mapping, folio); + return true; + } + + /* + * We may be zeroing pages we're about to discard, but it avoids + * doing a complex calculation here, and then doing the zeroing + * anyway if the page split fails. + */ + folio_zero_range(folio, offset, length); + + if (folio_has_private(folio)) + folio_invalidate(folio, offset, length); + if (!folio_test_large(folio)) + return true; + if (split_huge_page(&folio->page) == 0) + return true; + if (folio_test_dirty(folio)) + return false; + truncate_inode_folio(folio->mapping, folio); + return true; } /* @@ -235,6 +253,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page) */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (!mapping) return -EINVAL; /* @@ -243,26 +263,44 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_page(mapping, page); + return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page); -/* +static long mapping_evict_folio(struct address_space *mapping, + struct folio *folio) +{ + if (folio_test_dirty(folio) || folio_test_writeback(folio)) + return 0; + /* The refcount will be elevated if any page in the folio is mapped */ + if (folio_ref_count(folio) > + folio_nr_pages(folio) + folio_has_private(folio) + 1) + return 0; + if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) + return 0; + + return remove_mapping(mapping, folio); +} + +/** + * invalidate_inode_page() - Remove an unused page from the pagecache. + * @page: The page to remove. + * * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. The page must be locked. + * It only drops clean, unused pages. * - * Returns 1 if the page is successfully invalidated, otherwise 0. + * Context: Page must be locked. + * Return: The number of pages successfully removed. */ -int invalidate_inode_page(struct page *page) +long invalidate_inode_page(struct page *page) { - struct address_space *mapping = page_mapping(page); + struct folio *folio = page_folio(page); + struct address_space *mapping = folio_mapping(folio); + + /* The page may have been truncated before it was locked */ if (!mapping) return 0; - if (PageDirty(page) || PageWriteback(page)) - return 0; - if (page_mapped(page)) - return 0; - return invalidate_complete_page(mapping, page); + return mapping_evict_folio(mapping, folio); } /** @@ -285,7 +323,7 @@ int invalidate_inode_page(struct page *page) * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. 
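The reference-count test in mapping_evict_folio() above reads most easily with a worked example: for a clean, unmapped order-2 folio (4 pages) carrying fs-private data, the expected count is 4 page-cache references (assuming one per page, as the formula implies), plus 1 for the private data, plus 1 held by the caller that looked the folio up, i.e. 4 + 1 + 1 = 6. Any extra reference (a mapping, a pin, or a transient grab by reclaim) pushes folio_ref_count() above that and the folio is left in place rather than evicted.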
* - * Note that since ->invalidatepage() accepts range to invalidate + * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ @@ -294,19 +332,15 @@ void truncate_inode_pages_range(struct address_space *mapping, { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; + struct folio *folio; + bool same_folio; if (mapping_empty(mapping)) - goto out; - - /* Offsets within partial pages */ - partial_start = lstart & (PAGE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_SIZE - 1); + return; /* * 'start' and 'end' always covers the range of pages to be fully @@ -325,64 +359,49 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(pvec.pages[i]); - delete_from_page_cache_batch(mapping, &pvec); - for (i = 0; i < pagevec_count(&pvec); i++) - unlock_page(pvec.pages[i]); - pagevec_release(&pvec); + &fbatch, indices)) { + index = indices[folio_batch_count(&fbatch) - 1] + 1; + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + for (i = 0; i < folio_batch_count(&fbatch); i++) + truncate_cleanup_folio(fbatch.folios[i]); + delete_from_page_cache_batch(mapping, &fbatch); + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_unlock(fbatch.folios[i]); + folio_batch_release(&fbatch); cond_resched(); } - if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, + FGP_LOCK, 0); + if (folio) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. 
- */ - if (start >= end) - goto out; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -392,28 +411,26 @@ void truncate_inode_pages_range(struct address_space *mapping, continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - lock_page(page); - WARN_ON(page_to_index(page) != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + folio_wait_writeback(folio); + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + index = folio_index(folio) + folio_nr_pages(folio) - 1; } - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - pagevec_release(&pvec); + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + folio_batch_release(&fbatch); index++; } - -out: - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -467,56 +484,63 @@ void truncate_inode_pages_final(struct address_space *mapping) xa_unlock_irq(&mapping->i_pages); } - /* - * Cleancache needs notification even if there are no pages or shadow - * entries. - */ truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); -static unsigned long __invalidate_mapping_pages(struct address_space *mapping, +/** + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. 
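invalidate_mapping_pages() keeps its existing behaviour for external callers and is now a thin wrapper over the pagevec-accounting variant. A typical caller only converts byte offsets to page indices, as in this hypothetical helper:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical: drop clean, unlocked, unmapped pagecache covering
 * [lstart, lend] and report how many pages were released. */
static unsigned long drop_clean_cache_range(struct address_space *mapping,
					    loff_t lstart, loff_t lend)
{
	return invalidate_mapping_pages(mapping, lstart >> PAGE_SHIFT,
					lend >> PAGE_SHIFT);
}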
+ */ +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - pagevec_init(&pvec); - while (find_lock_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + folio_batch_init(&fbatch); + while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { count += invalidate_exceptional_entry(mapping, index, - page); + folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1; - ret = invalidate_inode_page(page); - unlock_page(page); + ret = mapping_evict_folio(mapping, folio); + folio_unlock(folio); /* - * Invalidation is a hint that the page is no longer + * Invalidation is a hint that the folio is no longer * of interest and try to speed up its reclaim. */ if (!ret) { - deactivate_file_page(page); + deactivate_file_folio(folio); /* It is likely on the pagevec of a remote CPU */ if (nr_pagevec) (*nr_pagevec)++; } count += ret; } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@ -540,59 +564,40 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL); + return invalidate_mapping_pagevec(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); -/** - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * @nr_pagevec: invalidate failed page number for caller - * - * This helper is similar to invalidate_mapping_pages(), except that it accounts - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which - * will be used by the caller. - */ -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) -{ - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); -} - /* - * This is like invalidate_complete_page(), except it ignores the page's + * This is like invalidate_inode_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. 
*/ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +static int invalidate_complete_folio2(struct address_space *mapping, + struct folio *folio) { - if (page->mapping != mapping) + if (folio->mapping != mapping) return 0; - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto failed; - BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); + BUG_ON(folio_has_private(folio)); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(page); - - put_page(page); /* pagecache ref */ + filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@ -600,13 +605,13 @@ failed: return 0; } -static int do_launder_page(struct address_space *mapping, struct page *page) +static int folio_launder(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; - return mapping->a_ops->launder_page(page); + return mapping->a_ops->launder_folio(folio); } /** @@ -624,7 +629,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; @@ -632,27 +637,27 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; if (mapping_empty(mapping)) - goto out; + return 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (find_get_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, page)) + index, folio)) ret = -EBUSY; continue; } - if (!did_range_unmap && page_mapped(page)) { + if (!did_range_unmap && folio_mapped(folio)) { /* - * If page is mapped, before taking its lock, + * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. 
*/ unmap_mapping_pages(mapping, index, @@ -660,29 +665,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping, did_range_unmap = 1; } - lock_page(page); - WARN_ON(page_to_index(page) != index); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); - ret2 = do_launder_page(mapping, page); + ret2 = folio_launder(mapping, folio); if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) + if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; - unlock_page(page); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@ -696,8 +701,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } -out: - cleancache_invalidate_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); diff --git a/mm/usercopy.c b/mm/usercopy.c index d0d268135d96..2c235d5c2364 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -29,7 +29,7 @@ * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame - * GOOD_STACK: fully on the stack (when can't do frame-checking) + * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) @@ -55,6 +55,17 @@ static noinline int check_stack_object(const void *obj, unsigned long len) if (ret) return ret; + /* Finally, check stack depth if possible. */ +#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER + if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { + if ((void *)current_stack_pointer < obj + len) + return BAD_STACK; + } else { + if (obj < (void *)current_stack_pointer) + return BAD_STACK; + } +#endif + return GOOD_STACK; } @@ -70,17 +81,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len) * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ -void usercopy_warn(const char *name, const char *detail, bool to_user, - unsigned long offset, unsigned long len) -{ - WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", - to_user ? "exposure" : "overwrite", - to_user ? "from" : "to", - name ? : "unknown?!", - detail ? " '" : "", detail ? : "", detail ? "'" : "", - offset, len); -} - void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) @@ -280,7 +280,15 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) */ return; default: - usercopy_abort("process stack", NULL, to_user, 0, n); + usercopy_abort("process stack", NULL, to_user, +#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER + IS_ENABLED(CONFIG_STACK_GROWSUP) ? + ptr - (void *)current_stack_pointer : + (void *)current_stack_pointer - ptr, +#else + 0, +#endif + n); } /* Check for bad heap object. 
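For reference, the stack-depth check added to check_stack_object() above rejects copies that reach into the unused part of the stack, with the comparison direction depending on CONFIG_STACK_GROWSUP. A minimal, self-contained sketch of the same comparison (userspace C, illustrative names only, not the kernel implementation):

    #include <stdbool.h>
    #include <stdint.h>

    /* Return true if [obj, obj + len) lies in the live part of the stack. */
    static bool within_live_stack(uintptr_t obj, unsigned long len,
                                  uintptr_t sp, bool stack_grows_up)
    {
            if (stack_grows_up)
                    return obj + len <= sp;  /* must not run past the stack pointer */
            return obj >= sp;                /* must not start below it */
    }

The same arithmetic gives the offset reported by usercopy_abort() above: the distance between the object and the current stack pointer.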
*/ @@ -295,7 +303,10 @@ static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { - return strtobool(str, &enable_checks); + if (strtobool(str, &enable_checks)) + pr_warn("Invalid option string for hardened_usercopy: '%s'\n", + str); + return 1; } __setup("hardened_usercopy=", parse_hardened_usercopy); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ac6f036298cd..0cb8e5ef1713 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ @@ -150,6 +152,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ goto out; } + + flush_dcache_page(page); } else { page = *pagep; *pagep = NULL; @@ -232,6 +236,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) @@ -620,6 +629,7 @@ retry: err = -EFAULT; goto out; } + flush_dcache_page(page); goto retry; } else BUG_ON(page); diff --git a/mm/util.c b/mm/util.c index 741ba32a43ac..1e2728736398 100644 --- a/mm/util.c +++ b/mm/util.c @@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -564,13 +561,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) void *ret; /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. - */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. 
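The kvmalloc_node() hunks here and just below only tune the gfp flags of the first, physically contiguous attempt; the hard __GFP_NOFAIL guarantee is left to the vmalloc fallback. A heavily simplified sketch of that overall shape (the name kvmalloc_sketch is made up; this is not the exact kernel code):

    /* Sketch: try kmalloc cheaply, fall back to vmalloc for large sizes. */
    void *kvmalloc_sketch(size_t size, gfp_t flags, int node)
    {
            gfp_t kmalloc_flags = flags;
            void *p;

            if (size > PAGE_SIZE) {
                    kmalloc_flags |= __GFP_NOWARN;
                    if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
                            kmalloc_flags |= __GFP_NORETRY;  /* don't reclaim hard here */
                    kmalloc_flags &= ~__GFP_NOFAIL;          /* nofail is vmalloc's job */
            }

            p = kmalloc_node(size, kmalloc_flags, node);
            if (p || size <= PAGE_SIZE)
                    return p;

            return __vmalloc_node(size, 1, flags, node,
                                  __builtin_return_address(0));
    }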
@@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); @@ -594,8 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) return ret; /* Don't even allow crazy sizes */ - if (WARN_ON_ONCE(size > INT_MAX)) + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; + } return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); @@ -686,9 +681,8 @@ bool folio_mapped(struct folio *folio) } EXPORT_SYMBOL(folio_mapped); -struct anon_vma *page_anon_vma(struct page *page) +struct anon_vma *folio_anon_vma(struct folio *folio) { - struct folio *folio = page_folio(page); unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) @@ -748,6 +742,39 @@ int __page_mapcount(struct page *page) EXPORT_SYMBOL_GPL(__page_mapcount); /** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +int folio_mapcount(struct folio *folio) +{ + int i, compound, nr, ret; + + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + + compound = folio_entire_mapcount(folio); + nr = folio_nr_pages(folio); + if (folio_test_hugetlb(folio)) + return compound; + ret = compound; + for (i = 0; i < nr; i++) + ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; + /* File pages has compound_mapcount included in _mapcount */ + if (!folio_test_anon(folio)) + return ret - compound * nr; + if (folio_test_double_map(folio)) + ret -= nr; + return ret; +} + +/** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d2a00ad4e1dd..99e0f3e8d1a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> @@ -38,6 +39,7 @@ #include <linux/pgtable.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -116,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry); pfn += PFN_DOWN(size); @@ -774,23 +775,13 @@ get_subtree_max_size(struct rb_node *node) return va ? va->subtree_max_size : 0; } -/* - * Gets called when remove the node and rotate. 
- */ -static __always_inline unsigned long -compute_subtree_max_size(struct vmap_area *va) -{ - return max3(va_size(va), - get_subtree_max_size(va->rb_node.rb_left), - get_subtree_max_size(va->rb_node.rb_right)); -} - RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); -static unsigned long lazy_max_pages(void); +static void drain_vmap_area_work(struct work_struct *work); +static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; @@ -971,6 +962,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root) } #if DEBUG_AUGMENT_PROPAGATE_CHECK +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} + static void augment_tree_propagate_check(void) { @@ -1187,22 +1189,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size, /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing - * parameters. + * parameters. Please note, with an alignment bigger than PAGE_SIZE, + * a search length is adjusted to account for worst case alignment + * overhead. */ static __always_inline struct vmap_area * -find_vmap_lowest_match(unsigned long size, - unsigned long align, unsigned long vstart) +find_vmap_lowest_match(unsigned long size, unsigned long align, + unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; + unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; + /* Adjust the search size for alignment overhead. */ + length = adjust_search_size ? size + align - 1 : size; + while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= size && + if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { @@ -1212,9 +1220,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search size. + * equal or bigger to the requested search length. */ - if (get_subtree_max_size(node->rb_right) >= size) { + if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } @@ -1230,7 +1238,7 @@ find_vmap_lowest_match(unsigned long size, if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= size && + if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. 
Please note, we update it with @@ -1278,7 +1286,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) @@ -1429,12 +1437,25 @@ static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { + bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; enum fit_type type; int ret; - va = find_vmap_lowest_match(size, align, vstart); + /* + * Do not adjust when: + * a) align <= PAGE_SIZE, because it does not make any sense. + * All blocks(their start addresses) are at least PAGE_SIZE + * aligned anyway; + * b) a short range where a requested size corresponds to exactly + * specified [vstart:vend] interval and an alignment > PAGE_SIZE. + * With adjusted search length an allocation would not succeed. + */ + if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) + adjust_search_size = false; + + va = find_vmap_lowest_match(size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; @@ -1718,18 +1739,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) } /* - * Kick off a purge of the outstanding lazy areas. Don't bother if somebody - * is already purging. - */ -static void try_purge_vmap_area_lazy(void) -{ - if (mutex_trylock(&vmap_purge_lock)) { - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - } -} - -/* * Kick off a purge of the outstanding lazy areas. */ static void purge_vmap_area_lazy(void) @@ -1740,6 +1749,20 @@ static void purge_vmap_area_lazy(void) mutex_unlock(&vmap_purge_lock); } +static void drain_vmap_area_work(struct work_struct *work) +{ + unsigned long nr_lazy; + + do { + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + + /* Recheck if further work is required. */ + nr_lazy = atomic_long_read(&vmap_lazy_nr); + } while (nr_lazy > lazy_max_pages()); +} + /* * Free a vmap area, caller ensuring that the area has been unmapped * and flush_cache_vunmap had been called for the correct range @@ -1766,7 +1789,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) - try_purge_vmap_area_lazy(); + schedule_work(&drain_vmap_work); } /* @@ -2623,12 +2646,13 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { unsigned int page_order = vm_area_page_order(area); - int i; + int i, step = 1U << page_order; - for (i = 0; i < area->nr_pages; i += 1U << page_order) { + for (i = 0; i < area->nr_pages; i += step) { struct page *page = area->pages[i]; BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -step); __free_pages(page, page_order); cond_resched(); } @@ -2844,6 +2868,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; + while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2861,12 +2887,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolcy want to alloc memory by interleaving. 
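The adjust_search_size logic above widens the free-block search from size to size + align - 1 so that worst-case alignment padding is already accounted for. An illustrative helper (ALIGN() as in the kernel; not part of the patch):

    /* Can a free block [start, start + len) hold an aligned request? */
    static bool fits_aligned(unsigned long start, unsigned long len,
                             unsigned long size, unsigned long align)
    {
            unsigned long aligned_start = ALIGN(start, align);

            return aligned_start + size <= start + len;
    }

A block of length size + align - 1 satisfies this for any start address, since at most align - 1 bytes are lost to rounding up. Case (b) in the comment above is the exception: when the [vstart:vend] window is exactly size bytes, widening the search would wrongly rule out the only candidate.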
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(gfp, + nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(gfp, nid, + nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -2920,12 +2946,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t orig_gfp_mask = gfp_mask; + bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; + unsigned int flags; + int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2941,7 +2969,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2951,25 +2979,52 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask, node, - page_order, nr_small_pages, area->pages); + area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + int i, step = 1U << page_order; + + for (i = 0; i < area->nr_pages; i += step) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, + step); + } /* * If not enough pages were obtained to accomplish an * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift) < 0) { - warn_alloc(orig_gfp_mask, NULL, + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + do { + ret = vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; @@ -2996,12 +3051,14 @@ fail: * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL would be a preferred allocation mode - * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not - * supported. 
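The __vmalloc_area_node() change above enforces the caller's GFP context across vmap_pages_range(), because page-table allocations underneath it use GFP_KERNEL internally. The general scope-API pattern looks roughly like this (do_mapping_step() is a stand-in, not a real function):

    unsigned int flags;
    int ret;

    /* Everything in this scope behaves as if allocated with GFP_NOFS. */
    flags = memalloc_nofs_save();
    ret = do_mapping_step();
    memalloc_nofs_restore(flags);

memalloc_noio_save()/memalloc_noio_restore() give the analogous GFP_NOIO scope, which is why the hunk picks one or the other based on which of __GFP_FS and __GFP_IO the caller left set.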
From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka - * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka - * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). - * __GFP_NOWARN can be used to suppress error messages about failures. + * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all + * supported. + * Zone modifiers are not supported. From the reclaim modifiers + * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) + * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and + * __GFP_RETRY_MAYFAIL are not supported). + * + * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. @@ -3056,9 +3113,14 @@ again: VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { + bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, vm_struct allocation failed", - real_size); + "vmalloc error: size %lu, vm_struct allocation failed%s", + real_size, (nofail) ? ". Retrying." : ""); + if (nofail) { + schedule_timeout_uninterruptible(1); + goto again; + } goto fail; } @@ -3074,7 +3136,8 @@ again: clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); - kmemleak_vmalloc(area, size, gfp_mask); + if (!(vm_flags & VM_DEFER_KMEMLEAK)) + kmemleak_vmalloc(area, size, gfp_mask); return addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index 700434db5735..1678802e03e7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include <linux/swapops.h> #include <linux/balloon_compaction.h> +#include <linux/sched/sysctl.h> #include "internal.h" @@ -951,7 +952,7 @@ out: return freed; } -void drop_slab_node(int nid) +static void drop_slab_node(int nid) { unsigned long freed; int shift = 0; @@ -978,47 +979,36 @@ void drop_slab(void) drop_slab_node(nid); } -static inline int is_page_cache_freeable(struct page *page) +static inline int is_page_cache_freeable(struct folio *folio) { /* * A freeable page cache page is referenced only by the caller * that isolated the page, the page cache and optional buffer * heads at page->private. */ - int page_cache_pins = thp_nr_pages(page); - return page_count(page) - page_has_private(page) == 1 + page_cache_pins; -} - -static int may_write_to_inode(struct inode *inode) -{ - if (current->flags & PF_SWAPWRITE) - return 1; - if (!inode_write_congested(inode)) - return 1; - if (inode_to_bdi(inode) == current->backing_dev_info) - return 1; - return 0; + return folio_ref_count(folio) - folio_test_private(folio) == + 1 + folio_nr_pages(folio); } /* - * We detected a synchronous write error writing a page out. Probably + * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing - * prevents it from being freed up. But we have a ref on the page and once - * that page is locked, the mapping is pinned. + * prevents it from being freed up. But we have a ref on the folio and once + * that folio is locked, the mapping is pinned. * - * We're allowed to run sleeping lock_page() here because we know the caller has + * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. 
*/ static void handle_write_error(struct address_space *mapping, - struct page *page, int error) + struct folio *folio, int error) { - lock_page(page); - if (page_mapping(page) == mapping) + folio_lock(folio); + if (folio_mapping(folio) == mapping) mapping_set_error(mapping, error); - unlock_page(page); + folio_unlock(folio); } static bool skip_throttle_noprogress(pg_data_t *pgdat) @@ -1066,8 +1056,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && - current->flags & (PF_IO_WORKER|PF_KTHREAD)) + current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + cond_resched(); return; + } /* * These figures are pulled out of thin air. @@ -1163,35 +1155,35 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct folio *folio, struct address_space *mapping) { /* - * If the page is dirty, only perform writeback if that write + * If the folio is dirty, only perform writeback if that write * will be non-blocking. To prevent this allocation from being * stalled by pagecache activity. But note that there may be * stalls if we need to run get_block(). We could test * PagePrivate for that. * * If this process is currently in __generic_file_write_iter() against - * this page's queue, we can perform writeback even if that + * this folio's queue, we can perform writeback even if that * will block. * - * If the page is swapcache, write it back even if that would + * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. */ - if (!is_page_cache_freeable(page)) + if (!is_page_cache_freeable(folio)) return PAGE_KEEP; if (!mapping) { /* - * Some data journaling orphaned pages can have - * page->mapping == NULL while being dirty with clean buffers. + * Some data journaling orphaned folios can have + * folio->mapping == NULL while being dirty with clean buffers. */ - if (page_has_private(page)) { - if (try_to_free_buffers(page)) { - ClearPageDirty(page); - pr_info("%s: orphaned page\n", __func__); + if (folio_test_private(folio)) { + if (try_to_free_buffers(&folio->page)) { + folio_clear_dirty(folio); + pr_info("%s: orphaned folio\n", __func__); return PAGE_CLEAN; } } @@ -1199,10 +1191,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host)) - return PAGE_KEEP; - if (clear_page_dirty_for_io(page)) { + if (folio_clear_dirty_for_io(folio)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, @@ -1212,21 +1202,21 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) .for_reclaim = 1, }; - SetPageReclaim(page); - res = mapping->a_ops->writepage(page, &wbc); + folio_set_reclaim(folio); + res = mapping->a_ops->writepage(&folio->page, &wbc); if (res < 0) - handle_write_error(mapping, page, res); + handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { - ClearPageReclaim(page); + folio_clear_reclaim(folio); return PAGE_ACTIVATE; } - if (!PageWriteback(page)) { + if (!folio_test_writeback(folio)) { /* synchronous write or broken a_ops? 
*/ - ClearPageReclaim(page); + folio_clear_reclaim(folio); } - trace_mm_vmscan_writepage(page); - inc_node_page_state(page, NR_VMSCAN_WRITE); + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -1237,16 +1227,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page, +static int __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg) { int refcount; void *shadow = NULL; - BUG_ON(!PageLocked(page)); - BUG_ON(mapping != page_mapping(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(mapping != folio_mapping(folio)); - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* @@ -1274,23 +1264,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under the i_pages lock, then this ordering is not required. */ - refcount = 1 + compound_nr(page); - if (!page_ref_freeze(page, refcount)) + refcount = 1 + folio_nr_pages(folio); + if (!folio_ref_freeze(folio, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ - if (unlikely(PageDirty(page))) { - page_ref_unfreeze(page, refcount); + if (unlikely(folio_test_dirty(folio))) { + folio_ref_unfreeze(folio, refcount); goto cannot_free; } - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; - mem_cgroup_swapout(page, swap); + if (folio_test_swapcache(folio)) { + swp_entry_t swap = folio_swap_entry(folio); + mem_cgroup_swapout(folio, swap); if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(page, target_memcg); - __delete_from_swap_cache(page, swap, shadow); + shadow = workingset_eviction(folio, target_memcg); + __delete_from_swap_cache(&folio->page, swap, shadow); xa_unlock_irq(&mapping->i_pages); - put_swap_page(page, swap); + put_swap_page(&folio->page, swap); } else { void (*freepage)(struct page *); @@ -1311,61 +1301,67 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * exceptional entries and shadow exceptional entries in the * same address_space. */ - if (reclaimed && page_is_file_lru(page) && + if (reclaimed && folio_is_file_lru(folio) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(page, target_memcg); - __delete_from_page_cache(page, shadow); + shadow = workingset_eviction(folio, target_memcg); + __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); if (freepage != NULL) - freepage(page); + freepage(&folio->page); } return 1; cannot_free: xa_unlock_irq(&mapping->i_pages); - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) spin_unlock(&mapping->host->i_lock); return 0; } -/* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. +/** + * remove_mapping() - Attempt to remove a folio from its mapping. + * @mapping: The address space. + * @folio: The folio to remove. 
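The folio_ref_freeze() call above encodes the only references a removable folio may still have: the isolating caller plus one page-cache reference per page. As a worked example (not from the patch), a single-page folio must sit at exactly two references for the freeze to succeed, and an order-4 (16-page) folio at exactly seventeen; any concurrent speculative reference makes the freeze fail and reclaim backs off.

    /* Sketch of the invariant checked by __remove_mapping(). */
    long expected = 1 /* isolating caller */ + folio_nr_pages(folio) /* page cache */;

    if (!folio_ref_freeze(folio, expected))
            goto cannot_free;   /* someone else still holds, or just took, a reference */

remove_mapping() later unfreezes to 1 rather than 0, which drops the page-cache reference without a second atomic operation, as the comment in the following hunk notes.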
+ * + * If the folio is dirty, under writeback or if someone else has a ref + * on it, removal will fail. + * Return: The number of pages removed from the mapping. 0 if the folio + * could not be removed. + * Context: The caller should have a single refcount on the folio and + * hold its lock. */ -int remove_mapping(struct address_space *mapping, struct page *page) +long remove_mapping(struct address_space *mapping, struct folio *folio) { - if (__remove_mapping(mapping, page, false, NULL)) { + if (__remove_mapping(mapping, folio, false, NULL)) { /* - * Unfreezing the refcount with 1 rather than 2 effectively + * Unfreezing the refcount with 1 effectively * drops the pagecache ref for us without requiring another * atomic operation. */ - page_ref_unfreeze(page, 1); - return 1; + folio_ref_unfreeze(folio, 1); + return folio_nr_pages(folio); } return 0; } /** - * putback_lru_page - put previously isolated page onto appropriate LRU list - * @page: page to be put back to appropriate lru list + * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. + * @folio: Folio to be returned to an LRU list. * - * Add previously isolated @page to appropriate LRU list. - * Page may still be unevictable for other reasons. + * Add previously isolated @folio to appropriate LRU list. + * The folio may still be unevictable for other reasons. * - * lru_lock must not be held, interrupts must be enabled. + * Context: lru_lock must not be held, interrupts must be enabled. */ -void putback_lru_page(struct page *page) +void folio_putback_lru(struct folio *folio) { - lru_cache_add(page); - put_page(page); /* drop ref from isolate */ + folio_add_lru(folio); + folio_put(folio); /* drop ref from isolate */ } enum page_references { @@ -1375,61 +1371,61 @@ enum page_references { PAGEREF_ACTIVATE, }; -static enum page_references page_check_references(struct page *page, +static enum page_references folio_check_references(struct folio *folio, struct scan_control *sc) { - int referenced_ptes, referenced_page; + int referenced_ptes, referenced_folio; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); - referenced_page = TestClearPageReferenced(page); + referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, + &vm_flags); + referenced_folio = folio_test_clear_referenced(folio); /* - * Mlock lost the isolation race with us. Let try_to_unmap() - * move the page to the unevictable list. + * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. + * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_RECLAIM; + return PAGEREF_ACTIVATE; if (referenced_ptes) { /* - * All mapped pages start out with page table + * All mapped folios start out with page table * references from the instantiating fault, so we need - * to look twice if a mapped file page is used more + * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * - * Note: the mark is set for activated pages as well - * so that recently deactivated but used pages are + * Note: the mark is set for activated folios as well + * so that recently deactivated but used folios are * quickly recovered. 
*/ - SetPageReferenced(page); + folio_set_referenced(folio); - if (referenced_page || referenced_ptes > 1) + if (referenced_folio || referenced_ptes > 1) return PAGEREF_ACTIVATE; /* - * Activate file-backed executable pages after first usage. + * Activate file-backed executable folios after first usage. */ - if ((vm_flags & VM_EXEC) && !PageSwapBacked(page)) + if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; } - /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page && !PageSwapBacked(page)) + /* Reclaim if clean, defer dirty folios to writeback */ + if (referenced_folio && !folio_test_swapbacked(folio)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; } /* Check if a page is dirty or under writeback */ -static void page_check_dirty_writeback(struct page *page, +static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; @@ -1438,24 +1434,24 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_lru(page) || - (PageAnon(page) && !PageSwapBacked(page))) { + if (!folio_is_file_lru(folio) || + (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { *dirty = false; *writeback = false; return; } - /* By default assume that the page flags are accurate */ - *dirty = PageDirty(page); - *writeback = PageWriteback(page); + /* By default assume that the folio flags are accurate */ + *dirty = folio_test_dirty(folio); + *writeback = folio_test_writeback(folio); /* Verify dirty/writeback state if the filesystem supports it */ - if (!page_has_private(page)) + if (!folio_test_private(folio)) return; - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) - mapping->a_ops->is_dirty_writeback(page, dirty, writeback); + mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback); } static struct page *alloc_demote_page(struct page *page, unsigned long node) @@ -1529,14 +1525,16 @@ retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; + struct folio *folio; enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback, may_enter_fs; unsigned int nr_pages; cond_resched(); - page = lru_to_page(page_list); - list_del(&page->lru); + folio = lru_to_folio(page_list); + list_del(&folio->lru); + page = &folio->page; if (!trylock_page(page)) goto keep; @@ -1562,12 +1560,12 @@ retry: * reclaim_congested. kswapd will stall and start writing * pages if the tail of the LRU is all dirty unqueued pages. */ - page_check_dirty_writeback(page, &dirty, &writeback); + folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) - stat->nr_dirty++; + stat->nr_dirty += nr_pages; if (dirty && !writeback) - stat->nr_unqueued_dirty++; + stat->nr_unqueued_dirty += nr_pages; /* * Treat this page as congested if the underlying BDI is or if @@ -1576,10 +1574,8 @@ retry: * end of the LRU a second time. 
*/ mapping = page_mapping(page); - if (((dirty || writeback) && mapping && - inode_write_congested(mapping->host)) || - (writeback && PageReclaim(page))) - stat->nr_congested++; + if (writeback && PageReclaim(page)) + stat->nr_congested += nr_pages; /* * If a page at the tail of the LRU is under writeback, there @@ -1628,7 +1624,7 @@ retry: if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { - stat->nr_immediate++; + stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ @@ -1646,7 +1642,7 @@ retry: * and it's also appropriate in global reclaim. */ SetPageReclaim(page); - stat->nr_writeback++; + stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ @@ -1660,7 +1656,7 @@ retry: } if (!ignore_references) - references = page_check_references(page, sc); + references = folio_check_references(folio, sc); switch (references) { case PAGEREF_ACTIVATE: @@ -1693,28 +1689,28 @@ retry: if (!PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (page_maybe_dma_pinned(page)) + if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (PageTransHuge(page)) { /* cannot split THP, skip it */ - if (!can_split_huge_page(page, NULL)) + if (!can_split_folio(folio, NULL)) goto activate_locked; /* * Split pages without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ - if (!compound_mapcount(page) && - split_huge_page_to_list(page, - page_list)) + if (!folio_entire_mapcount(folio) && + split_folio_to_list(folio, + page_list)) goto activate_locked; } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked_split; /* Fallback to swap normal pages */ - if (split_huge_page_to_list(page, - page_list)) + if (split_folio_to_list(folio, + page_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); @@ -1728,9 +1724,9 @@ retry: /* Adding to swap updated mapping */ mapping = page_mapping(page); } - } else if (unlikely(PageTransHuge(page))) { - /* Split file THP */ - if (split_huge_page_to_list(page, page_list)) + } else if (PageSwapBacked(page) && PageTransHuge(page)) { + /* Split shmem THP */ + if (split_folio_to_list(folio, page_list)) goto keep_locked; } @@ -1754,10 +1750,11 @@ retry: enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = PageSwapBacked(page); - if (unlikely(PageTransHuge(page))) + if (PageTransHuge(page) && + thp_order(page) >= HPAGE_PMD_ORDER) flags |= TTU_SPLIT_HUGE_PMD; - try_to_unmap(page, flags); + try_to_unmap(folio, flags); if (page_mapped(page)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && PageSwapBacked(page)) @@ -1805,13 +1802,13 @@ retry: * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(page, mapping)) { + switch (pageout(folio, mapping)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - stat->nr_pageout += thp_nr_pages(page); + stat->nr_pageout += nr_pages; if (PageWriteback(page)) goto keep; @@ -1889,7 +1886,7 @@ retry: */ count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); - } else if (!mapping || !__remove_mapping(mapping, page, true, + } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; @@ -2012,69 +2009,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, } /* - * Attempt to remove the specified page from its LRU. Only take this page - * if it is of the appropriate PageActive status. 
Pages which are being - * freed elsewhere are also ignored. - * - * page: page to consider - * mode: one of the LRU isolation modes defined above - * - * returns true on success, false on failure. - */ -bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) -{ - /* Only take pages on the LRU. */ - if (!PageLRU(page)) - return false; - - /* Compaction should not handle unevictable pages but CMA can do so */ - if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) - return false; - - /* - * To minimise LRU disruption, the caller can indicate that it only - * wants to isolate pages it will be able to operate on without - * blocking - clean pages for the most part. - * - * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages - * that it is possible to migrate without blocking - */ - if (mode & ISOLATE_ASYNC_MIGRATE) { - /* All the caller can do on PageWriteback is block */ - if (PageWriteback(page)) - return false; - - if (PageDirty(page)) { - struct address_space *mapping; - bool migrate_dirty; - - /* - * Only pages without mappings or that have a - * ->migratepage callback are possible to migrate - * without blocking. However, we can be racing with - * truncation so it's necessary to lock the page - * to stabilise the mapping as truncation holds - * the page lock until after the page is removed - * from the page cache. - */ - if (!trylock_page(page)) - return false; - - mapping = page_mapping(page); - migrate_dirty = !mapping || mapping->a_ops->migratepage; - unlock_page(page); - if (!migrate_dirty) - return false; - } - } - - if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) - return false; - - return true; -} - -/* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ @@ -2125,11 +2059,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); - isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); total_scan = 0; scan = 0; while (scan < nr_to_scan && !list_empty(src)) { + struct list_head *move_to = src; struct page *page; page = lru_to_page(src); @@ -2139,9 +2073,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { - list_move(&page->lru, &pages_skipped); nr_skipped[page_zonenum(page)] += nr_pages; - continue; + move_to = &pages_skipped; + goto move; } /* @@ -2149,37 +2083,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * return with no isolated pages if the LRU mostly contains * ineligible pages. This causes the VM to not reclaim any * pages, triggering a premature OOM. - * - * Account all tail pages of THP. This would not cause - * premature OOM since __isolate_lru_page() returns -EBUSY - * only when the page is being freed somewhere else. + * Account all tail pages of THP. */ scan += nr_pages; - if (!__isolate_lru_page_prepare(page, mode)) { - /* It is being freed elsewhere */ - list_move(&page->lru, src); - continue; - } + + if (!PageLRU(page)) + goto move; + if (!sc->may_unmap && page_mapped(page)) + goto move; + /* * Be careful not to clear PageLRU until after we're * sure the page is not being freed elsewhere -- the * page release code relies on it. 
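The isolation loop above is restructured so every page takes exactly one list_move(), to a destination chosen earlier in the iteration: back to the source list, onto the skipped list, or onto the destination list. A bare-bones sketch of that single-exit shape, with bookkeeping omitted and the helper names invented for illustration:

    while (scan < nr_to_scan && !list_empty(src)) {
            struct list_head *move_to = src;        /* default: leave it where it was */
            struct page *page = lru_to_page(src);

            if (zone_too_high(page))                /* hypothetical check */
                    move_to = &pages_skipped;
            else if (can_take_page(page))           /* hypothetical check */
                    move_to = dst;

            list_move(&page->lru, move_to);         /* exactly one move per page */
    }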
*/ - if (unlikely(!get_page_unless_zero(page))) { - list_move(&page->lru, src); - continue; - } + if (unlikely(!get_page_unless_zero(page))) + goto move; if (!TestClearPageLRU(page)) { /* Another thread is already isolating this page */ put_page(page); - list_move(&page->lru, src); - continue; + goto move; } nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; - list_move(&page->lru, dst); + move_to = dst; +move: + list_move(&page->lru, move_to); } /* @@ -2203,51 +2134,47 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - total_scan, skipped, nr_taken, mode, lru); + total_scan, skipped, nr_taken, + sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** - * isolate_lru_page - tries to isolate a page from its LRU list - * @page: page to isolate from its LRU list - * - * Isolates a @page from an LRU list, clears PageLRU and adjusts the - * vmstat statistic corresponding to whatever LRU list the page was on. + * folio_isolate_lru() - Try to isolate a folio from its LRU list. + * @folio: Folio to isolate from its LRU list. * - * Returns 0 if the page was removed from an LRU list. - * Returns -EBUSY if the page was not on an LRU list. + * Isolate a @folio from an LRU list and adjust the vmstat statistic + * corresponding to whatever LRU list the folio was on. * - * The returned page will have PageLRU() cleared. If it was found on - * the active list, it will have PageActive set. If it was found on - * the unevictable list, it will have the PageUnevictable bit set. That flag + * The folio will have its LRU flag cleared. If it was found on the + * active list, it will have the Active flag set. If it was found on the + * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * - * The vmstat statistic corresponding to the list on which the page was - * found will be decremented. - * - * Restrictions: + * Context: * * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages (which is called + * fundamental difference from isolate_lru_pages() (which is called * without a stable reference). - * (2) the lru_lock must not be held. - * (3) interrupts must be enabled. + * (2) The lru_lock must not be held. + * (3) Interrupts must be enabled. + * + * Return: 0 if the folio was removed from an LRU list. + * -EBUSY if the folio was not on an LRU list. 
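Given the calling rules in the kerneldoc above, a typical use of folio_isolate_lru() looks like the following sketch (assuming a reference is already held, for example from a page-cache lookup; this is not code from the patch):

    if (folio_isolate_lru(folio) == 0) {
            /* ... operate on the folio while it is off its LRU list ... */
            folio_putback_lru(folio);   /* re-add to the LRU, drop the isolation ref */
    }
    folio_put(folio);                   /* drop the caller's own reference */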
*/ -int isolate_lru_page(struct page *page) +int folio_isolate_lru(struct folio *folio) { - struct folio *folio = page_folio(page); int ret = -EBUSY; - VM_BUG_ON_PAGE(!page_count(page), page); - WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); - if (TestClearPageLRU(page)) { + if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; - get_page(page); + folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); - del_page_from_lru_list(page, lruvec); + lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -2377,9 +2304,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, */ static int current_may_throttle(void) { - return !(current->flags & PF_LOCAL_THROTTLE) || - current->backing_dev_info == NULL || - bdi_write_congested(current->backing_dev_info); + return !(current->flags & PF_LOCAL_THROTTLE); } /* @@ -2485,7 +2410,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * * If the pages are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()), so + * the pages are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because @@ -2505,7 +2430,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); - struct page *page; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2527,9 +2451,13 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { + struct folio *folio; + struct page *page; + cond_resched(); - page = lru_to_page(&l_hold); - list_del(&page->lru); + folio = lru_to_folio(&l_hold); + list_del(&folio->lru); + page = &folio->page; if (unlikely(!page_evictable(page))) { putback_lru_page(page); @@ -2544,8 +2472,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } } - if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { + if (folio_referenced(folio, 0, sc->target_mem_cgroup, + &vm_flags)) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -3975,7 +3903,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) if (!managed_zone(zone)) continue; - mark = high_wmark_pages(zone); + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + mark = wmark_pages(zone, WMARK_PROMO); + else + mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } @@ -4472,7 +4403,7 @@ static int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). 
*/ - tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); @@ -4523,7 +4454,7 @@ kswapd_try_sleep: goto kswapd_try_sleep; } - tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); return 0; } @@ -4764,11 +4695,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP - * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_UNMAP. */ noreclaim_flag = memalloc_noreclaim_save(); - p->flags |= PF_SWAPWRITE; set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { @@ -4782,7 +4710,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } set_task_reclaim_state(p, NULL); - current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); diff --git a/mm/vmstat.c b/mm/vmstat.c index d701c335628c..b75b1a64b54c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -28,6 +28,7 @@ #include <linux/mm_inline.h> #include <linux/page_ext.h> #include <linux/page_owner.h> +#include <linux/migrate.h> #include "internal.h" @@ -1242,6 +1243,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "nr_swapcached", #endif +#ifdef CONFIG_NUMA_BALANCING + "pgpromote_success", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1353,6 +1357,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif @@ -1382,6 +1389,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", +#ifdef CONFIG_KSM + "ksm_swpin_copy", +#endif #endif #ifdef CONFIG_X86 "direct_map_level2_splits", @@ -2040,7 +2050,12 @@ static void __init init_cpu_node_state(void) static int vmstat_cpu_online(unsigned int cpu) { refresh_zone_stat_thresholds(); - node_set_state(cpu_to_node(cpu), N_CPU); + + if (!node_state(cpu_to_node(cpu), N_CPU)) { + node_set_state(cpu_to_node(cpu), N_CPU); + set_migration_target_nodes(); + } + return 0; } @@ -2063,6 +2078,8 @@ static int vmstat_cpu_dead(unsigned int cpu) return 0; node_clear_state(node, N_CPU); + set_migration_target_nodes(); + return 0; } @@ -2094,6 +2111,9 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif +#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU) + migrate_on_reclaim_init(); +#endif #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); diff --git a/mm/workingset.c b/mm/workingset.c index 8c03afe1d67c..8a3828acc0bf 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -245,31 +245,32 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) } /** - * workingset_eviction - note the eviction of a page from memory + * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim - * @page: the page being evicted + * @folio: the folio being evicted * - * Return: a shadow entry to be stored in @page->mapping->i_pages in place - * of the evicted @page so that a later refault can be 
detected. + * Return: a shadow entry to be stored in @folio->mapping->i_pages in place + * of the evicted @folio so that a later refault can be detected. */ -void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) +void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page's memory cgroup pointer */ - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + /* Folio is fully exclusive and pins folio's memory cgroup pointer */ + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + return pack_shadow(memcgid, pgdat, eviction, + folio_test_workingset(folio)); } /** @@ -429,10 +430,12 @@ out: * point where they would still be useful. */ -static struct list_lru shadow_nodes; +struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { + struct address_space *mapping; + /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @@ -441,7 +444,8 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + mapping = container_of(node->array, struct address_space, i_pages); + lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/zpool.c b/mm/zpool.c index 6d9ed48141e5..68facc193496 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -24,16 +24,11 @@ struct zpool { const struct zpool_ops *ops; bool evictable; bool can_sleep_mapped; - - struct list_head list; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); -static LIST_HEAD(pools_head); -static DEFINE_SPINLOCK(pools_lock); - /** * zpool_register_driver() - register a zpool implementation. 
* @driver: driver to register @@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, pr_debug("created pool type %s\n", type); - spin_lock(&pools_lock); - list_add(&zpool->list, &pools_head); - spin_unlock(&pools_lock); - return zpool; } @@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); - spin_lock(&pools_lock); - list_del(&zpool->list); - spin_unlock(&pools_lock); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0d3b65939016..9152fbde33b5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * lock ordering: + * page_lock + * pool->migrate_lock + * class->lock + * zspage->lock + */ + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -57,6 +65,7 @@ #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> +#include <linux/local_lock.h> #define ZSPAGE_MAGIC 0x58 @@ -101,15 +110,6 @@ #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* - * Memory for allocating for handle keeps object position by - * encoding <page, obj_idx> and the encoded value has a room - * in least bit(ie, look at obj_to_location). - * We use the bit to synchronize between object access by - * user and migration. - */ -#define HANDLE_PIN_BIT 0 - -/* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. * It's okay to add the status bit in the least bit because @@ -121,6 +121,7 @@ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @@ -158,7 +159,7 @@ enum fullness_group { NR_ZS_FULLNESS, }; -enum zs_stat_type { +enum class_stat_type { CLASS_EMPTY, CLASS_ALMOST_EMPTY, CLASS_ALMOST_FULL, @@ -213,22 +214,6 @@ struct size_class { struct zs_size_stat stats; }; -/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ -static void SetPageHugeObject(struct page *page) -{ - SetPageOwnerPriv1(page); -} - -static void ClearPageHugeObject(struct page *page) -{ - ClearPageOwnerPriv1(page); -} - -static int PageHugeObject(struct page *page) -{ - return PageOwnerPriv1(page); -} - /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. 
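For readers following the zsmalloc hunks: the defines above (_PFN_BITS, OBJ_INDEX_BITS, OBJ_INDEX_MASK) and the removed HANDLE_PIN_BIT comment describe the <pfn, obj_idx> packing that obj_to_location()/obj_to_page() decode, with one spare low bit that is now used only for OBJ_ALLOCATED_TAG in the object header (or page->index for a huge zspage) rather than as a pin lock. Below is a minimal, self-contained userspace sketch of that arithmetic; the PAGE_SHIFT/PHYSMEM_BITS values and the encode_obj/decode_obj/word_allocated names are assumptions chosen for the illustration, not the kernel's configuration or symbols.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT        12                 /* assumed: 4K pages */
#define PHYSMEM_BITS      46                 /* assumed physical address width */
#define PFN_BITS          (PHYSMEM_BITS - PAGE_SHIFT)
#define OBJ_TAG_BITS      1
#define OBJ_INDEX_BITS    (64 - PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK    ((1UL << OBJ_INDEX_BITS) - 1)
#define OBJ_ALLOCATED_TAG 1UL

/* pack <pfn, obj_idx> above the tag bit, as location_to_obj() does */
static uint64_t encode_obj(uint64_t pfn, unsigned int obj_idx)
{
	return ((pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK)) << OBJ_TAG_BITS;
}

/* unpack, as obj_to_location()/obj_to_page() do */
static void decode_obj(uint64_t obj, uint64_t *pfn, unsigned int *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*pfn = obj >> OBJ_INDEX_BITS;
	*obj_idx = obj & OBJ_INDEX_MASK;
}

/*
 * A stored header word is only meaningful when OBJ_ALLOCATED_TAG is set;
 * this mirrors the check the new obj_allocated() helper performs before
 * stripping the tag.
 */
static bool word_allocated(uint64_t stored, uint64_t *payload)
{
	if (!(stored & OBJ_ALLOCATED_TAG))
		return false;
	*payload = stored & ~OBJ_ALLOCATED_TAG;
	return true;
}

int main(void)
{
	uint64_t obj = encode_obj(0x12345, 7);
	uint64_t pfn, payload;
	unsigned int idx;

	decode_obj(obj, &pfn, &idx);
	assert(pfn == 0x12345 && idx == 7);

	assert(word_allocated(obj | OBJ_ALLOCATED_TAG, &payload) && payload == obj);
	assert(!word_allocated(obj, &payload));	/* tag clear: treated as free */

	printf("pfn=%#llx idx=%u\n", (unsigned long long)pfn, idx);
	return 0;
}

With the pin bit gone, nothing in the encoded value is used for locking any more; as the later hunks show, that synchronization moves to pool->migrate_lock, class->lock and the per-zspage rwlock.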
@@ -269,15 +254,14 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; - /* A wait queue for when migration races with async_free_zspage() */ - struct wait_queue_head migration_wait; - atomic_long_t isolated_pages; - bool destroying; #endif + /* protect page/zspage migration */ + rwlock_t migrate_lock; }; struct zspage { struct { + unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; @@ -293,17 +277,32 @@ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + #ifdef CONFIG_COMPACTION static int zs_register_migration(struct zs_pool *pool); static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); @@ -315,6 +314,9 @@ static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -366,14 +368,10 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +/* class->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. 
- */ - WRITE_ONCE(*(unsigned long *)handle, obj); + *(unsigned long *)handle = obj; } /* zpool driver */ @@ -455,12 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); - -static bool is_zspage_isolated(struct zspage *zspage) -{ - return zspage->isolated; -} +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static __maybe_unused int is_first_page(struct page *page) { @@ -517,6 +512,12 @@ static void get_zspage_mapping(struct zspage *zspage, *class_idx = zspage->class; } +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) @@ -543,21 +544,21 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_inc(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_dec(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ +/* type can be of enum type class_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, int type) { @@ -719,7 +720,7 @@ static void insert_zspage(struct size_class *class, { struct zspage *head; - zs_stat_inc(class, fullness, 1); + class_stat_inc(class, fullness, 1); head = list_first_entry_or_null(&class->fullness_list[fullness], struct zspage, list); /* @@ -741,10 +742,9 @@ static void remove_zspage(struct size_class *class, enum fullness_group fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); - VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); - zs_stat_dec(class, fullness, 1); + class_stat_dec(class, fullness, 1); } /* @@ -767,13 +767,9 @@ static enum fullness_group fix_fullness_group(struct size_class *class, if (newfg == currfg) goto out; - if (!is_zspage_isolated(zspage)) { - remove_zspage(class, zspage, currfg); - insert_zspage(class, zspage, newfg); - } - + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); - out: return newfg; } @@ -824,7 +820,9 @@ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - if (unlikely(PageHugeObject(page))) + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) return NULL; return (struct page *)page->index; @@ -844,6 +842,12 @@ static void obj_to_location(unsigned long obj, struct page **page, *obj_idx = (obj & OBJ_INDEX_MASK); } +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + /** * location_to_obj - get obj value encoded from (<page>, <obj_idx>) * @page: page object resides in zspage @@ -865,33 +869,22 @@ static unsigned long 
handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct page *page, void *obj) +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { - if (unlikely(PageHugeObject(page))) { + unsigned long handle; + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page->index; + handle = page->index; } else - return *(unsigned long *)obj; -} - -static inline int testpin_tag(unsigned long handle) -{ - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static inline int trypin_tag(unsigned long handle) -{ - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -} + handle = *(unsigned long *)obj; -static void pin_tag(unsigned long handle) __acquires(bitlock) -{ - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -} + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; -static void unpin_tag(unsigned long handle) __releases(bitlock) -{ - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); + *phandle = handle & ~OBJ_ALLOCATED_TAG; + return true; } static void reset_page(struct page *page) @@ -900,7 +893,6 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); - ClearPageHugeObject(page); page->index = 0; } @@ -952,7 +944,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } @@ -963,6 +955,11 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. The page locks trylock_zspage got will be released + * by __free_zspage. + */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; @@ -1042,7 +1039,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) - SetPageHugeObject(page); + SetZsHugePage(zspage); } else { prev_page->index = (unsigned long)page; } @@ -1246,8 +1243,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; @@ -1260,21 +1255,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - /* From now on, migration cannot move the object */ - pin_tag(handle); - + /* It guarantees it can get zspage from handle safely */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - /* migration cannot move any subpage in this zspage */ + /* + * migration cannot move any zpages in this zspage. Here, class->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. 
+ */ migrate_read_lock(zspage); + read_unlock(&pool->migrate_lock); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1290,7 +1290,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (likely(!PageHugeObject(page))) + if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; @@ -1304,16 +1304,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); @@ -1328,10 +1325,9 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); - unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1354,17 +1350,19 @@ size_t zs_huge_class_size(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_huge_class_size); -static unsigned long obj_malloc(struct size_class *class, +static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; + struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; + class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); @@ -1379,7 +1377,7 @@ static unsigned long obj_malloc(struct size_class *class, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); - if (likely(!PageHugeObject(m_page))) + if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else @@ -1388,7 +1386,6 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - zs_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1424,13 +1421,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; + /* class->lock effectively protects the zpage migration */ spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); + class_stat_inc(class, OBJ_USED, 1); spin_unlock(&class->lock); return handle; @@ -1445,14 +1444,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } spin_lock(&class->lock); - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, 
class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_USED, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1462,7 +1462,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct size_class *class, unsigned long obj) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @@ -1472,18 +1472,20 @@ static void obj_free(struct size_class *class, unsigned long obj) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class->size * f_objidx) & ~PAGE_MASK; + f_offset = (class_size * f_objidx) & ~PAGE_MASK; zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) @@ -1491,42 +1493,33 @@ void zs_free(struct zs_pool *pool, unsigned long handle) struct zspage *zspage; struct page *f_page; unsigned long obj; - unsigned int f_objidx; - int class_idx; struct size_class *class; enum fullness_group fullness; - bool isolated; if (unlikely(!handle)) return; - pin_tag(handle); + /* + * The pool->migrate_lock protects the race with zpage's migration + * so it's safe to get the page from handle. 
+ */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_location(obj, &f_page, &f_objidx); + obj_to_page(obj, &f_page); zspage = get_zspage(f_page); - - migrate_read_lock(zspage); - - get_zspage_mapping(zspage, &class_idx, &fullness); - class = pool->size_class[class_idx]; - + class = zspage_class(pool, zspage); spin_lock(&class->lock); - obj_free(class, obj); + read_unlock(&pool->migrate_lock); + + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) { - migrate_read_unlock(zspage); + if (fullness != ZS_EMPTY) goto out; - } - isolated = is_zspage_isolated(zspage); - migrate_read_unlock(zspage); - /* If zspage is isolated, zs_page_putback will free the zspage */ - if (likely(!isolated)) - free_zspage(pool, class, zspage); + free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); - unpin_tag(handle); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1601,7 +1594,6 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - unsigned long head; int offset = 0; int index = *obj_idx; unsigned long handle = 0; @@ -1611,13 +1603,8 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(page, addr + offset); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (trypin_tag(handle)) - break; - handle = 0; - } + if (obj_allocated(page, addr + offset, &handle)) + break; offset += class->size; index++; @@ -1663,25 +1650,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Stop if there is no more space */ if (zspage_full(class, get_zspage(d_page))) { - unpin_tag(handle); ret = -ENOMEM; break; } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; - /* - * record_obj updates handle's value to free_obj and it will - * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which - * breaks synchronization using pin_tag(e,g, zs_free) so - * let's keep the lock bit. 
- */ - free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); - unpin_tag(handle); - obj_free(class, used_obj); + obj_free(class->size, used_obj); } /* Remember last position in this iteration */ @@ -1706,7 +1684,6 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], struct zspage, list); if (zspage) { - VM_BUG_ON(is_zspage_isolated(zspage)); remove_zspage(class, zspage, fg[i]); return zspage; } @@ -1727,8 +1704,6 @@ static enum fullness_group putback_zspage(struct size_class *class, { enum fullness_group fullness; - VM_BUG_ON(is_zspage_isolated(zspage)); - fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); set_zspage_mapping(zspage, class->index, fullness); @@ -1797,6 +1772,11 @@ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -1810,35 +1790,10 @@ static void inc_zspage_isolation(struct zspage *zspage) static void dec_zspage_isolation(struct zspage *zspage) { + VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } -static void putback_zspage_deferred(struct zs_pool *pool, - struct size_class *class, - struct zspage *zspage) -{ - enum fullness_group fg; - - fg = putback_zspage(class, zspage); - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); - -} - -static inline void zs_pool_dec_isolated(struct zs_pool *pool) -{ - VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); - /* - * Checking pool->destroying must happen after atomic_long_dec() - * for pool->isolated_pages above. Paired with the smp_mb() in - * zs_unregister_migration(). - */ - smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) - wake_up_all(&pool->migration_wait); -} - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1857,19 +1812,14 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); - if (unlikely(PageHugeObject(oldpage))) + if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, page_mapping(oldpage)); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; - struct address_space *mapping; /* * Page is locked so zspage couldn't be destroyed. For detail, look at @@ -1879,41 +1829,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); - - /* - * Without class lock, fullness could be stale while class_idx is okay - * because class_idx is constant unless page is freed so we should get - * fullness again under class lock. 
- */ - get_zspage_mapping(zspage, &class_idx, &fullness); - mapping = page_mapping(page); - pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); - if (get_zspage_inuse(zspage) == 0) { - spin_unlock(&class->lock); - return false; - } - - /* zspage is isolated for object migration */ - if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - spin_unlock(&class->lock); - return false; - } - - /* - * If this is first time isolation for the zspage, isolate zspage from - * size_class to prevent further object allocation from the zspage. - */ - if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - get_zspage_mapping(zspage, &class_idx, &fullness); - atomic_long_inc(&pool->isolated_pages); - remove_zspage(class, zspage, fullness); - } - + migrate_write_lock(zspage); inc_zspage_isolation(zspage); - spin_unlock(&class->lock); + migrate_write_unlock(zspage); return true; } @@ -1923,16 +1841,13 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset, pos; - unsigned long handle, head; + int offset; + unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - int ret = -EAGAIN; /* * We cannot support the _NO_COPY case here, because copy needs to @@ -1945,35 +1860,25 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - zspage = get_zspage(page); - - /* Concurrent compactor cannot migrate any subpage in zspage */ - migrate_write_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); pool = mapping->private_data; - class = pool->size_class[class_idx]; - offset = get_first_obj_offset(page); + /* + * The pool migrate_lock protects the race between zpage migration + * and zs_free. + */ + write_lock(&pool->migrate_lock); + zspage = get_zspage(page); + class = zspage_class(pool, zspage); + + /* + * the class lock protects zpage alloc/free in the zspage. + */ spin_lock(&class->lock); - if (!get_zspage_inuse(zspage)) { - /* - * Set "offset" to end of the page so that every loops - * skips unnecessary object scanning. - */ - offset = PAGE_SIZE; - } + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); - pos = offset; + offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); - while (pos < PAGE_SIZE) { - head = obj_to_head(page, s_addr + pos); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (!trypin_tag(handle)) - goto unpin_objects; - } - pos += class->size; - } /* * Here, any user cannot access all objects in the zspage so let's move. 
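The rewritten zs_page_migrate() above is where the hierarchy from the new lock-ordering comment (pool->migrate_lock, then class->lock, then the per-zspage rwlock) is taken in full, and __zs_compact() later follows the same order. The sketch below is a hypothetical userspace analogue of that nesting, with pthread locks standing in for the kernel rwlock/spinlock primitives; the names and the simplified unlock order are inventions for the illustration, not the patch's code.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t pool_migrate_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  class_lock        = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t zspage_lock       = PTHREAD_RWLOCK_INITIALIZER;

/* migration/compaction side: write ownership at every level, outermost first */
static void migration_side(void)
{
	pthread_rwlock_wrlock(&pool_migrate_lock); /* vs. handle lookup in zs_free()/zs_map_object() */
	pthread_mutex_lock(&class_lock);           /* vs. object alloc/free in this size class */
	pthread_rwlock_wrlock(&zspage_lock);       /* vs. mappers of this particular zspage */

	puts("move objects / replace sub-page here");

	pthread_rwlock_unlock(&zspage_lock);
	pthread_mutex_unlock(&class_lock);
	pthread_rwlock_unlock(&pool_migrate_lock);
}

/* mapping side: readers only; the pool-level lock is dropped as soon as the
 * zspage itself is pinned */
static void mapping_side(void)
{
	pthread_rwlock_rdlock(&pool_migrate_lock);
	pthread_rwlock_rdlock(&zspage_lock);
	pthread_rwlock_unlock(&pool_migrate_lock);

	puts("access the mapped object here");

	pthread_rwlock_unlock(&zspage_lock);
}

int main(void)
{
	migration_side();
	mapping_side();
	return 0;
}

The read side mirrors zs_map_object() earlier in this patch: the pool-wide lock is held only long enough to resolve the handle to a stable zspage, after which the per-zspage read lock alone keeps migration away, which is why the comment there calls it the smaller-granularity lock.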
@@ -1982,42 +1887,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, memcpy(d_addr, s_addr, PAGE_SIZE); kunmap_atomic(d_addr); - for (addr = s_addr + offset; addr < s_addr + pos; + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - BUG_ON(!testpin_tag(handle)); + if (obj_allocated(page, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); - new_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, new_obj); } } + kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - get_page(newpage); - - dec_zspage_isolation(zspage); - /* - * Page migration is done so let's putback isolated zspage to - * the list if @page is final isolated subpage in the zspage. + * Since we complete the data copy and set up new zspage structure, + * it's okay to release migration_lock. */ - if (!is_zspage_isolated(zspage)) { - /* - * We cannot race with zs_destroy_pool() here because we wait - * for isolation to hit zero before we start destroying. - * Also, we ensure that everyone can see pool->destroying before - * we start waiting. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } + write_unlock(&pool->migrate_lock); + spin_unlock(&class->lock); + dec_zspage_isolation(zspage); + migrate_write_unlock(zspage); + get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -2025,55 +1918,21 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, reset_page(page); put_page(page); - page = newpage; - - ret = MIGRATEPAGE_SUCCESS; -unpin_objects: - for (addr = s_addr + offset; addr < s_addr + pos; - addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - BUG_ON(!testpin_tag(handle)); - unpin_tag(handle); - } - } - kunmap_atomic(s_addr); - spin_unlock(&class->lock); - migrate_write_unlock(zspage); - return ret; + return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fg; - struct address_space *mapping; struct zspage *zspage; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - mapping = page_mapping(page); - pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); + migrate_write_lock(zspage); dec_zspage_isolation(zspage); - if (!is_zspage_isolated(zspage)) { - /* - * Due to page_lock, we cannot free zspage immediately - * so let's defer. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - spin_unlock(&class->lock); + migrate_write_unlock(zspage); } static const struct address_space_operations zsmalloc_aops = { @@ -2095,36 +1954,8 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } -static bool pool_isolated_are_drained(struct zs_pool *pool) -{ - return atomic_long_read(&pool->isolated_pages) == 0; -} - -/* Function for resolving migration */ -static void wait_for_isolated_drain(struct zs_pool *pool) -{ - - /* - * We're in the process of destroying the pool, so there are no - * active allocations. 
zs_page_isolate() fails for completely free - * zspages, so we need only wait for the zs_pool's isolated - * count to hit zero. - */ - wait_event(pool->migration_wait, - pool_isolated_are_drained(pool)); -} - static void zs_unregister_migration(struct zs_pool *pool) { - pool->destroying = true; - /* - * We need a memory barrier here to ensure global visibility of - * pool->destroying. Thus pool->isolated pages will either be 0 in which - * case we don't care, or it will be > 0 and pool->destroying will - * ensure that we wake up once isolation hits 0. - */ - smp_mb(); - wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2154,7 +1985,6 @@ static void async_free_zspage(struct work_struct *work) spin_unlock(&class->lock); } - list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); @@ -2218,8 +2048,13 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; + /* protect the race between zpage migration and zs_free */ + write_lock(&pool->migrate_lock); + /* protect zpage allocation/free */ spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { + /* protect someone accessing the zspage(i.e., zs_map_object) */ + migrate_write_lock(src_zspage); if (!zs_can_compact(class)) break; @@ -2228,6 +2063,8 @@ static unsigned long __zs_compact(struct zs_pool *pool, cc.s_page = get_first_page(src_zspage); while ((dst_zspage = isolate_zspage(class, false))) { + migrate_write_lock_nested(dst_zspage); + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched @@ -2237,6 +2074,10 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + if (rwlock_is_contended(&pool->migrate_lock)) + break; } /* Stop if we couldn't find slot */ @@ -2244,19 +2085,28 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_unlock(src_zspage); free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; - } + } else + migrate_write_unlock(src_zspage); spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); cond_resched(); + write_lock(&pool->migrate_lock); spin_lock(&class->lock); } - if (src_zspage) + if (src_zspage) { putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); return pages_freed; } @@ -2362,15 +2212,12 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); + rwlock_init(&pool->migrate_lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err; -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pool->migration_wait); -#endif - if (create_cache(pool)) goto err; diff --git a/mm/zswap.c b/mm/zswap.c index 7944e3e57e78..3efd8cae315e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* Enable/disable handling same-value filled pages (enabled by default) */ +/* + * Enable/disable handling same-value filled pages (enabled by default). + * If disabled every page is considered non-same-value filled. 
+ */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); +/* Enable/disable handling non-same-value filled pages (enabled by default) */ +static bool zswap_non_same_filled_pages_enabled = true; +module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } + if (!zswap_non_same_filled_pages_enabled) { + ret = -EINVAL; + goto freepage; + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { @@ -1378,7 +1391,7 @@ static void zswap_frontswap_init(unsigned type) zswap_trees[type] = tree; } -static struct frontswap_ops zswap_frontswap_ops = { +static const struct frontswap_ops zswap_frontswap_ops = { .store = zswap_frontswap_store, .load = zswap_frontswap_load, .invalidate_page = zswap_frontswap_invalidate_page, @@ -1475,11 +1488,15 @@ static int __init init_zswap(void) if (!shrink_wq) goto fallback_fail; - frontswap_register_ops(&zswap_frontswap_ops); + ret = frontswap_register_ops(&zswap_frontswap_ops); + if (ret) + goto destroy_wq; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; +destroy_wq: + destroy_workqueue(shrink_wq); fallback_fail: if (pool) zswap_pool_destroy(pool); |
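The new zswap knob pairs with the existing same_filled_pages_enabled one: same-value-filled pages are stored as just their fill value, and the store path above now rejects everything else with -EINVAL when non_same_filled_pages_enabled is off, so those pages fall through to the backing swap device. Below is a minimal userspace sketch of the same-value test being gated here; the in-kernel helper may differ in detail and the names are chosen for the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Returns true and the fill value if every word in the page is identical. */
static bool page_same_filled(const void *page, unsigned long *value)
{
	const unsigned long *word = page;
	size_t i, nr = PAGE_SIZE / sizeof(*word);

	for (i = 1; i < nr; i++)
		if (word[i] != word[0])
			return false;

	*value = word[0];	/* store this instead of a compressed copy */
	return true;
}

int main(void)
{
	static unsigned long page[PAGE_SIZE / sizeof(unsigned long)];
	unsigned long val;

	memset(page, 0, sizeof(page));
	printf("zero page: %d\n", page_same_filled(page, &val));	/* 1 */

	page[17] = 0xdeadbeef;
	printf("dirty page: %d\n", page_same_filled(page, &val));	/* 0 */
	return 0;
}

Because both parameters are registered with mode 0644, they should also be toggleable at runtime under /sys/module/zswap/parameters/ in addition to being set on the kernel command line.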