Diffstat (limited to 'mm')

-rw-r--r--  mm/gup_benchmark.c  |   8
-rw-r--r--  mm/highmem.c        |   2
-rw-r--r--  mm/huge_memory.c    |  38
-rw-r--r--  mm/hugetlb.c        |  51
-rw-r--r--  mm/hugetlb_cgroup.c | 198
-rw-r--r--  mm/memcontrol.c     |  37
-rw-r--r--  mm/memory.c         |   2
-rw-r--r--  mm/memory_hotplug.c |  31
-rw-r--r--  mm/mempolicy.c      |  10
-rw-r--r--  mm/memremap.c       |   2
-rw-r--r--  mm/migrate.c        |  23
-rw-r--r--  mm/mmap.c           |   8
-rw-r--r--  mm/oom_kill.c       |   2
-rw-r--r--  mm/page-writeback.c |  10
-rw-r--r--  mm/page_alloc.c     |  61
-rw-r--r--  mm/shmem.c          |   7
-rw-r--r--  mm/slab.c           |   4
-rw-r--r--  mm/slab_common.c    |   3
-rw-r--r--  mm/slub.c           |  16
-rw-r--r--  mm/sparse.c         |   9
-rw-r--r--  mm/vmalloc.c        |  12
-rw-r--r--  mm/zsmalloc.c       |   5

22 files changed, 384 insertions, 155 deletions
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7dd602d7f8db..ad9d5b1c4473 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -26,6 +26,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, unsigned long i, nr_pages, addr, next; int nr; struct page **pages; + int ret = 0; if (gup->size > ULONG_MAX) return -EINVAL; @@ -63,7 +64,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, NULL); break; default: - return -1; + kvfree(pages); + ret = -EINVAL; + goto out; } if (nr <= 0) @@ -85,7 +88,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); - return 0; +out: + return ret; } static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, diff --git a/mm/highmem.c b/mm/highmem.c index 107b10f9878e..64d8dea47dd1 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,7 +29,7 @@ #include <linux/highmem.h> #include <linux/kgdb.h> #include <asm/tlbflush.h> - +#include <linux/vmalloc.h> #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 41a0fbddc96b..a88093213674 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,13 +527,13 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, +static unsigned long __thp_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { - unsigned long addr; loff_t off_end = off + len; loff_t off_align = round_up(off, size); - unsigned long len_pad; + unsigned long len_pad, ret; if (off_end <= off_align || (off_end - off_align) < size) return 0; @@ -542,30 +542,40 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long le if (len_pad < len || (off + len_pad) < off) return 0; - addr = current->mm->get_unmapped_area(filp, 0, len_pad, + ret = current->mm->get_unmapped_area(filp, addr, len_pad, off >> PAGE_SHIFT, flags); - if (IS_ERR_VALUE(addr)) + + /* + * The failure might be due to length padding. The caller will retry + * without the padding. + */ + if (IS_ERR_VALUE(ret)) return 0; - addr += (off - addr) & (size - 1); - return addr; + /* + * Do not try to align to THP boundary if allocation at the address + * hint succeeds. 
+ */ + if (ret == addr) + return addr; + + ret += (off - ret) & (size - 1); + return ret; } unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - if (addr) - goto out; if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) goto out; - addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); - if (addr) - return addr; - - out: + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); + if (ret) + return ret; +out: return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac65bb5e38ac..dd8737a94bec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,7 @@ #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/numa.h> +#include <linux/llist.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -1136,7 +1137,7 @@ static inline void ClearPageHugeTemporary(struct page *page) page[2].mapping = NULL; } -void free_huge_page(struct page *page) +static void __free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1199,6 +1200,54 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } +/* + * As free_huge_page() can be called from a non-task context, we have + * to defer the actual freeing in a workqueue to prevent potential + * hugetlb_lock deadlock. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to + * be freed and frees them one-by-one. As the page->mapping pointer is + * going to be cleared in __free_huge_page() anyway, it is reused as the + * llist_node structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + struct page *page; + + node = llist_del_all(&hpage_freelist); + + while (node) { + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + __free_huge_page(page); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +void free_huge_page(struct page *page) +{ + /* + * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. + */ + if (!in_task()) { + /* + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the + * workfn hasn't retrieved the list yet. + */ + if (llist_add((struct llist_node *)&page->mapping, + &hpage_freelist)) + schedule_work(&free_hpage_work); + return; + } + + __free_huge_page(page); +} + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 2ac38bdc18a1..e434b05416c6 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -3,6 +3,10 @@ * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * + * Cgroup v2 + * Copyright (C) 2019 Red Hat, Inc. + * Author: Giuseppe Scrivano <gscrivan@redhat.com> + * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. 
@@ -19,18 +23,36 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> +enum hugetlb_memory_event { + HUGETLB_MAX, + HUGETLB_NR_MEMORY_EVENTS, +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; + /* * the counter to account for hugepages from hugetlb. */ struct page_counter hugepage[HUGE_MAX_HSTATE]; + + atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; + atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; + + /* Handle for "hugetlb.events" */ + struct cgroup_file events_file[HUGE_MAX_HSTATE]; + + /* Handle for "hugetlb.events.local" */ + struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; }; #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +#define hugetlb_cgroup_from_counter(counter, idx) \ + container_of(counter, struct hugetlb_cgroup, hugepage[idx]) + static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -178,6 +200,19 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) } while (hugetlb_cgroup_have_usage(h_cg)); } +static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, + enum hugetlb_memory_event event) +{ + atomic_long_inc(&hugetlb->events_local[idx][event]); + cgroup_file_notify(&hugetlb->events_local_file[idx]); + + do { + atomic_long_inc(&hugetlb->events[idx][event]); + cgroup_file_notify(&hugetlb->events_file[idx]); + } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && + !hugetlb_cgroup_is_root(hugetlb)); +} + int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { @@ -202,8 +237,12 @@ again: } rcu_read_unlock(); - if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, + &counter)) { ret = -ENOMEM; + hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx, + HUGETLB_MAX); + } css_put(&h_cg->css); done: *ptr = h_cg; @@ -283,10 +322,45 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) +{ + int idx; + u64 val; + struct cftype *cft = seq_cft(seq); + unsigned long limit; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + counter = &h_cg->hugepage[idx]; + + limit = round_down(PAGE_COUNTER_MAX, + 1 << huge_page_order(&hstates[idx])); + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + val = (u64)page_counter_read(counter); + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + case RES_LIMIT: + val = (u64)counter->max; + if (val == limit) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + default: + BUG(); + } + + return 0; +} + static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) + char *buf, size_t nbytes, loff_t off, + const char *max) { int ret, idx; unsigned long nr_pages; @@ -296,7 +370,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, "-1", &nr_pages); + ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; @@ -316,6 +390,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, + char *buf, size_t 
nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); +} + +static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); +} + static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -350,7 +436,36 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -static void __init __hugetlb_cgroup_file_init(int idx) +static int __hugetlb_events_show(struct seq_file *seq, bool local) +{ + int idx; + long max; + struct cftype *cft = seq_cft(seq); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + + if (local) + max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); + else + max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); + + seq_printf(seq, "max %lu\n", max); + + return 0; +} + +static int hugetlb_events_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, false); +} + +static int hugetlb_events_local_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, true); +} + +static void __init __hugetlb_cgroup_file_dfl_init(int idx) { char buf[32]; struct cftype *cft; @@ -360,38 +475,93 @@ static void __init __hugetlb_cgroup_file_init(int idx) mem_fmt(buf, 32, huge_page_size(h)); /* Add the limit file */ - cft = &h->cgroup_files[0]; + cft = &h->cgroup_files_dfl[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current usage file */ + cft = &h->cgroup_files_dfl[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events file */ + cft = &h->cgroup_files_dfl[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]), + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events.local file */ + cft = &h->cgroup_files_dfl[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_local_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, + events_local_file[idx]), + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[4]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files_dfl)); +} + +static void __init __hugetlb_cgroup_file_legacy_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files_legacy[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write_legacy; /* Add the usage file */ - cft = &h->cgroup_files[1]; + cft = &h->cgroup_files_legacy[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = 
hugetlb_cgroup_read_u64; /* Add the MAX usage file */ - cft = &h->cgroup_files[2]; + cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ - cft = &h->cgroup_files[3]; + cft = &h->cgroup_files_legacy[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ - cft = &h->cgroup_files[4]; + cft = &h->cgroup_files_legacy[4]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, - h->cgroup_files)); + h->cgroup_files_legacy)); +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + __hugetlb_cgroup_file_dfl_init(idx); + __hugetlb_cgroup_file_legacy_init(idx); } void __init hugetlb_cgroup_file_init(void) @@ -433,8 +603,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } +static struct cftype hugetlb_files[] = { + {} /* terminate */ +}; + struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, + .dfl_cftypes = hugetlb_files, + .legacy_cftypes = hugetlb_files, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5b5f74cfd4d..6c83cf4ed970 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3287,49 +3287,34 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) { - unsigned long stat[MEMCG_NR_STAT]; + unsigned long stat[MEMCG_NR_STAT] = {0}; struct mem_cgroup *mi; int node, cpu, i; - int min_idx, max_idx; - - if (slab_only) { - min_idx = NR_SLAB_RECLAIMABLE; - max_idx = NR_SLAB_UNRECLAIMABLE; - } else { - min_idx = 0; - max_idx = MEMCG_NR_STAT; - } - - for (i = min_idx; i < max_idx; i++) - stat[i] = 0; for_each_online_cpu(cpu) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < MEMCG_NR_STAT; i++) stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < MEMCG_NR_STAT; i++) atomic_long_add(stat[i], &mi->vmstats[i]); - if (!slab_only) - max_idx = NR_VM_NODE_STAT_ITEMS; - for_each_node(node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; struct mem_cgroup_per_node *pi; - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) stat[i] += per_cpu( pn->lruvec_stat_cpu->count[i], cpu); for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) atomic_long_add(stat[i], &pi->lruvec_stat[i]); } } @@ -3403,13 +3388,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* - * Deactivate and reparent kmem_caches. Then flush percpu - * slab statistics to have precise values at the parent and - * all ancestor levels. It's required to keep slab stats - * accurate after the reparenting of kmem_caches. + * Deactivate and reparent kmem_caches. 
*/ memcg_deactivate_kmem_caches(memcg, parent); - memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4913,7 +4894,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) * Flush percpu vmstats and vmevents to guarantee the value correctness * on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmstats(memcg); memcg_flush_percpu_vmevents(memcg); __mem_cgroup_free(memcg); } diff --git a/mm/memory.c b/mm/memory.c index 45442d9a4f52..1c4be871a237 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2203,7 +2203,7 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spinlock_t *ptl = pte_lockptr(mm, pmd); spin_lock(ptl); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 55ac23ef11c1..a91a072f2b2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -483,8 +483,9 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages) +void __ref remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; @@ -499,28 +500,30 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn, return; #endif + clear_zone_contiguous(zone); + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); + + set_zone_contiguous(zone); } -static void __remove_section(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, + struct vmem_altmap *altmap) { struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - __remove_zone(zone, pfn, nr_pages); sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** - * __remove_pages() - remove sections of pages from a zone - * @zone: zone from which pages need to be removed + * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used @@ -530,16 +533,14 @@ static void __remove_section(struct zone *zone, unsigned long pfn, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). 
*/ -void __remove_pages(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) { unsigned long map_offset = 0; unsigned long nr, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); - clear_zone_contiguous(zone); - if (check_pfn_span(pfn, nr_pages, "remove")) return; @@ -551,13 +552,11 @@ void __remove_pages(struct zone *zone, unsigned long pfn, cond_resched(); pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - __remove_section(zone, pfn, pfns, map_offset, altmap); + __remove_section(pfn, pfns, map_offset, altmap); pfn += pfns; nr_pages -= pfns; map_offset = 0; } - - set_zone_contiguous(zone); } int set_online_page_callback(online_page_callback_t callback) @@ -869,6 +868,7 @@ failed_addition: (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + remove_pfn_range_from_zone(zone, pfn, nr_pages); mem_hotplug_done(); return ret; } @@ -1628,6 +1628,7 @@ static int __ref __offline_pages(unsigned long start_pfn, writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + remove_pfn_range_from_zone(zone, start_pfn, nr_pages); mem_hotplug_done(); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 067cf7d3daf5..b2920ae87a61 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2148,18 +2148,22 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); + /* + * First, try to allocate THP only on local node, but + * don't reclaim unnecessarily, just compact. + */ page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); + gfp | __GFP_THISNODE | __GFP_NORETRY, order); /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote - * memory as well. + * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_node(hpage_node, - gfp | __GFP_NORETRY, order); + gfp, order); goto out; } diff --git a/mm/memremap.c b/mm/memremap.c index 03ccbdfeb697..c51c6bd2fe34 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -120,7 +120,7 @@ void memunmap_pages(struct dev_pagemap *pgmap) mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(page_zone(first_page), PHYS_PFN(res->start), + __remove_pages(PHYS_PFN(res->start), PHYS_PFN(resource_size(res)), NULL); } else { arch_remove_memory(nid, res->start, resource_size(res), diff --git a/mm/migrate.c b/mm/migrate.c index eae1565285e3..86873b6f38a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1512,9 +1512,11 @@ static int do_move_pages_to_node(struct mm_struct *mm, /* * Resolves the given address to a struct page, isolates it from the LRU and * puts it to the given pagelist. 
- * Returns -errno if the page cannot be found/isolated or 0 when it has been - * queued or the page doesn't need to be migrated because it is already on - * the target node + * Returns: + * errno - if the page cannot be found/isolated + * 0 - when it doesn't have to be migrated because it is already on the + * target node + * 1 - when it has been queued */ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, int node, struct list_head *pagelist, bool migrate_all) @@ -1553,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { isolate_huge_page(page, pagelist); - err = 0; + err = 1; } } else { struct page *head; @@ -1563,7 +1565,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (err) goto out_putpage; - err = 0; + err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_cache(head), @@ -1640,8 +1642,17 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, */ err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) + + if (!err) { + /* The page is already on the target node */ + err = store_status(status, i, current_node, 1); + if (err) + goto out_flush; continue; + } else if (err > 0) { + /* The page is successfully queued for migration */ + continue; + } err = store_status(status, i, err, 1); if (err) diff --git a/mm/mmap.c b/mm/mmap.c index 9c648524e4dc..bc788548c4e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -90,12 +90,6 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE: - * r: (no) no - * w: (no) no - * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -3342,6 +3336,8 @@ static const struct vm_operations_struct special_mapping_vmops = { .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, + /* vDSO code relies that VVAR can't be accessed remotely */ + .access = NULL, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 71e3acea7817..d58c481b3df8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -890,7 +890,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), - mm_pgtables_bytes(mm), victim->signal->oom_score_adj); + mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50055d2e4ea8..2caf780a42e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, if (this_bw < tot_bw) { if (min) { min *= this_bw; - do_div(min, tot_bw); + min = div64_ul(min, tot_bw); } if (max < 100) { max *= this_bw; - do_div(max, tot_bw); + max = div64_ul(max, tot_bw); } } @@ -766,7 +766,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; u64 wb_thresh; - long numerator, denominator; + unsigned long 
numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* @@ -777,7 +777,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; - do_div(wb_thresh, denominator); + wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); @@ -1102,7 +1102,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { - do_div(bw, elapsed); + bw = div64_ul(bw, elapsed); avg = bw; goto out; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4785a8a2040e..d047bf7d8fd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -694,34 +694,27 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT -DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); -#else +bool _debug_pagealloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -#endif EXPORT_SYMBOL(_debug_pagealloc_enabled); DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static int __init early_debug_pagealloc(char *buf) { - bool enable = false; - - if (kstrtobool(buf, &enable)) - return -EINVAL; - - if (enable) - static_branch_enable(&_debug_pagealloc_enabled); - - return 0; + return kstrtobool(buf, &_debug_pagealloc_enabled_early); } early_param("debug_pagealloc", early_debug_pagealloc); -static void init_debug_guardpage(void) +void init_debug_pagealloc(void) { if (!debug_pagealloc_enabled()) return; + static_branch_enable(&_debug_pagealloc_enabled); + if (!debug_guardpage_minorder()) return; @@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) kernel_map_pages(page, 1 << order, 0); kasan_free_nondeferred_pages(page, order); @@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return free_pages_check(page); else return false; @@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct page *page) */ static bool free_pcp_prepare(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return free_pages_prepare(page, 0, true); else return free_pages_prepare(page, 0, false); @@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void) for_each_populated_zone(zone) set_zone_contiguous(zone); - -#ifdef CONFIG_DEBUG_PAGEALLOC - init_debug_guardpage(); -#endif } #ifdef CONFIG_CMA @@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed(void) */ static inline bool check_pcp_refill(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return check_new_page(page); else return false; @@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(struct page *page) } static inline bool check_new_pcp(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return check_new_page(page); else return false; @@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if 
(debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) kernel_map_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); @@ -4476,8 +4465,11 @@ retry_cpuset: if (page) goto got_pg; - if (order >= pageblock_order && (gfp_mask & __GFP_IO) && - !(gfp_mask & __GFP_RETRY_MAYFAIL)) { + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes some THP page fault allocations + */ + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If allocating entire pageblock(s) and compaction * failed because all zones are below low watermarks @@ -4498,23 +4490,6 @@ retry_cpuset: if (compact_result == COMPACT_SKIPPED || compact_result == COMPACT_DEFERRED) goto nopage; - } - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * If compaction is deferred for high-order allocations, - * it is because sync compaction recently failed. If - * this is the case and the caller requested a THP - * allocation, we do not want to heavily disrupt the - * system, so we fail the allocation instead of entering - * direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; /* * Looks like reclaim/compaction is worth trying, but diff --git a/mm/shmem.c b/mm/shmem.c index 165fa6332993..8793e8cc1a48 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2107,9 +2107,10 @@ unsigned long shmem_get_unmapped_area(struct file *file, /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. - * But if caller specified an address hint, respect that as before. + * But if caller specified an address hint and we allocated area there + * successfully, respect that as before. */ - if (uaddr) + if (uaddr == addr) return addr; if (shmem_huge != SHMEM_HUGE_FORCE) { @@ -2143,7 +2144,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); + inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) diff --git a/mm/slab.c b/mm/slab.c index f1e1840af533..a89633603b2d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1416,7 +1416,7 @@ static void kmem_rcu_free(struct rcu_head *head) #if DEBUG static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) { - if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && (cachep->size % PAGE_SIZE) == 0) return true; @@ -2008,7 +2008,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && size >= 256 && cachep->object_size > cache_line_size()) { if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { size_t tmp_size = ALIGN(size, PAGE_SIZE); diff --git a/mm/slab_common.c b/mm/slab_common.c index f0ab6d4ceb4c..0d95ddea13b0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct kmem_cache *s) * deactivates the memcg kmem_caches through workqueue. Make sure all * previous workitems on workqueue are processed. 
*/ - flush_workqueue(memcg_kmem_cache_wq); + if (likely(memcg_kmem_cache_wq)) + flush_workqueue(memcg_kmem_cache_wq); /* * If we're racing with children kmem_cache deactivation, it might diff --git a/mm/slub.c b/mm/slub.c index d11389710b12..0ab92ec8c2a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -288,7 +288,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) unsigned long freepointer_addr; void *p; - if (!debug_pagealloc_enabled()) + if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); freepointer_addr = (unsigned long)object + s->offset; @@ -1964,7 +1964,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, return get_any_partial(s, flags, c); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguiation * during cmpxchg. The transactions start with the cpu number and are then @@ -2009,7 +2009,7 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); @@ -2341,7 +2341,7 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); } /* @@ -2637,7 +2637,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long flags; local_irq_save(flags); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * We may have been preempted and rescheduled on a different * cpu before disabling interrupts. Need to reload cpu area @@ -2691,13 +2691,13 @@ redo: * as we end up on the original cpu again when doing the cmpxchg. * * We should guarantee that tid and kmem_cache are retrieved on - * the same cpu. It could be different if CONFIG_PREEMPT so we need + * the same cpu. It could be different if CONFIG_PREEMPTION so we need * to check if it is matched or not. */ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* @@ -2971,7 +2971,7 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ diff --git a/mm/sparse.c b/mm/sparse.c index b20ab7cdac86..3822ecbd8a1f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -777,7 +777,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { unsigned long section_nr = pfn_to_section_nr(pfn); - if (!section_is_early) { + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. 
+ */ + if (!PageReserved(virt_to_page(ms->usage))) { kfree(ms->usage); ms->usage = NULL; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9681dc4aa75..1f46c3b86f9f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -41,6 +41,14 @@ #include "internal.h" +bool is_vmalloc_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + + return addr >= VMALLOC_START && addr < VMALLOC_END; +} +EXPORT_SYMBOL(is_vmalloc_addr); + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -1383,7 +1391,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); @@ -1681,7 +1689,7 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range((unsigned long)addr, (unsigned long)addr + size); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2b2b9aae8a3c..22d17ecfe7df 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2069,6 +2069,11 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, zs_pool_dec_isolated(pool); } + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } + reset_page(page); put_page(page); page = newpage; |
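
For reference, the deferred freeing that the mm/hugetlb.c hunks add to free_huge_page() is an instance of a common "lockless list plus workqueue" pattern: a caller that may be running in non-task context only pushes the object onto an llist and kicks a work item, and the work function, running in process context, drains the list and does the real freeing under the sleeping-safe locks. A minimal stand-alone sketch of that pattern follows; the names (deferred_obj, deferred_free, ...) are illustrative and not taken from the patch.

/*
 * Sketch of the defer-to-workqueue pattern used by free_huge_page()
 * above.  Producers may run in atomic/non-task context, so they only
 * push onto a lockless llist; the actual freeing happens from process
 * context in the work function.  Illustrative names, not kernel code.
 */
#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct deferred_obj {
	struct llist_node node;
	/* payload ... */
};

static LLIST_HEAD(deferred_list);

static void deferred_free_workfn(struct work_struct *work)
{
	struct llist_node *n = llist_del_all(&deferred_list);

	while (n) {
		struct deferred_obj *obj =
			llist_entry(n, struct deferred_obj, node);

		n = n->next;
		kfree(obj);	/* safe: task context, may take sleeping locks */
	}
}
static DECLARE_WORK(deferred_free_work, deferred_free_workfn);

static void deferred_free(struct deferred_obj *obj)
{
	/*
	 * llist_add() returns true only when the list was empty before,
	 * so the work is scheduled at most once per batch -- the same
	 * trick free_huge_page() plays with hpage_freelist.
	 */
	if (llist_add(&obj->node, &deferred_list))
		schedule_work(&deferred_free_work);
}

The only twist in the actual patch is that struct page has no spare llist_node, so free_huge_page() reuses the page->mapping field for it, which is safe because __free_huge_page() clears that field anyway.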
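The mm/page-writeback.c hunks swap do_div() for div64_ul(). The relevant difference, as I read it, is that do_div() divides a u64 by a 32-bit value and so silently truncates a divisor that is a full unsigned long, whereas div64_ul() accepts the whole unsigned long divisor; with numerator and denominator now declared unsigned long, the div64_ul() form is the correct one on 64-bit kernels. A tiny illustrative helper (not from the patch):

#include <linux/math64.h>

/*
 * Divide a 64-bit quantity by an unsigned long without truncating the
 * divisor to 32 bits the way do_div() would.  Illustrative only.
 */
static u64 rate_per_interval(u64 bytes, unsigned long interval)
{
	return div64_ul(bytes, interval);
}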
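Another recurring change (mm/page_alloc.c, slab.c, slub.c, vmalloc.c) replaces debug_pagealloc_enabled() with debug_pagealloc_enabled_static() on hot paths. The structure visible in the page_alloc.c hunk is: parse the boot parameter into a plain bool very early, then enable a static key once during init (init_debug_pagealloc()) so that later fast-path checks compile down to a runtime-patched jump rather than a memory load. A generic sketch of that "early bool plus static key" arrangement, with illustrative names:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/jump_label.h>

/* Illustrative names; the real ones are _debug_pagealloc_enabled_early etc. */
static bool my_debug_enabled_early __read_mostly;
static DEFINE_STATIC_KEY_FALSE(my_debug_enabled);

static int __init early_my_debug(char *buf)
{
	/* Runs before static keys can be patched, so only record the bool. */
	return kstrtobool(buf, &my_debug_enabled_early);
}
early_param("my_debug", early_my_debug);

void __init my_debug_init(void)
{
	/* Called once it is safe to patch code. */
	if (my_debug_enabled_early)
		static_branch_enable(&my_debug_enabled);
}

/* Fast-path check: a patched no-op/jump instead of a load-and-test. */
static inline bool my_debug_on(void)
{
	return static_branch_unlikely(&my_debug_enabled);
}

In the patch, init_debug_pagealloc() fills the my_debug_init() role and also absorbs the guard-page setup, which is why the init_debug_guardpage() call disappears from page_alloc_init_late().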