Diffstat (limited to 'kernel')
32 files changed, 1191 insertions, 598 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 557a9b9d2244..1ea181a58465 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ +/* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); @@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid) * cases where a subsystem should behave differnetly depending on the * interface version. * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" @@ -4881,7 +4874,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 41ca996568df..b6397a186ce9 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -389,18 +389,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_base_stat_cputime_account_end(cgrp, rstatc); } +/* + * compute the cputime for the root cgroup by getting the per cpu data + * at a global level, then categorizing the fields in a manner consistent + * with how it is done by __cgroup_account_cputime_field for each bit of + * cpu time attributed to a cgroup. + */ +static void root_cgroup_cputime(struct task_cputime *cputime) +{ + int i; + + cputime->stime = 0; + cputime->utime = 0; + cputime->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + u64 user = 0; + u64 sys = 0; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + user += cpustat[CPUTIME_NICE]; + cputime->utime += user; + + sys += cpustat[CPUTIME_SYSTEM]; + sys += cpustat[CPUTIME_IRQ]; + sys += cpustat[CPUTIME_SOFTIRQ]; + cputime->stime += sys; + + cputime->sum_exec_runtime += user; + cputime->sum_exec_runtime += sys; + cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; + cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; + } +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - cgroup_rstat_flush_release(); + struct task_cputime cputime; + + if (cgroup_parent(cgrp)) { + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, + &utime, &stime); + cgroup_rstat_flush_release(); + } else { + root_cgroup_cputime(&cputime); + usage = cputime.sum_exec_runtime; + utime = cputime.utime; + stime = cputime.stime; + } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index cbca6879ab7d..44a259338e33 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -80,7 +80,7 @@ 
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls; + int nr_calls = 0; int ret = 0; ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); diff --git a/kernel/cred.c b/kernel/cred.c index 71a792616917..421b1149c651 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -315,6 +315,9 @@ struct cred *prepare_exec_creds(void) new->process_keyring = NULL; #endif + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; + return new; } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ef94e906f05a..ccc0f98abdd4 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -415,6 +415,18 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } +int kgdb_has_hit_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_ACTIVE && + kgdb_break[i].bpt_addr == addr) + return 1; + } + return 0; +} + int dbg_remove_all_break(void) { int error; @@ -923,7 +935,7 @@ static void sysrq_handle_dbg(int key) kgdb_breakpoint(); } -static struct sysrq_key_op sysrq_dbg_op = { +static const struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, .help_msg = "debug(g)", .action_msg = "DEBUG", diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c103a24e380..d006668c0027 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -79,10 +79,14 @@ config DMA_REMAP select DMA_NONCOHERENT_MMAP bool -config DMA_DIRECT_REMAP +config DMA_COHERENT_POOL bool select DMA_REMAP +config DMA_DIRECT_REMAP + bool + select DMA_COHERENT_POOL + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index d237cf3dc181..370f63344e9c 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8bc6f2d670f9..15bc5026c485 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,8 +222,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * @gfp: Allocation flags. * * This function allocates contiguous memory buffer for specified device. It - * first tries to use device specific contiguous memory area if available or - * the default global one, then tries a fallback allocation of normal pages. + * tries to use device specific contiguous memory area if available, or the + * default global one. 
* * Note that it byapss one-page size of allocations from the global area as * the addresses within one page are always contiguous, so there is no need diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 9e1777c81f55..36c962a86bf2 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -656,7 +656,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -void __dma_entry_alloc_check_leak(void) +static void __dma_entry_alloc_check_leak(void) { u32 tmp = nr_total_entries % nr_prealloc_entries; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f4bbdaf965e..0a4881e59aa7 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -76,6 +76,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +/* + * Decrypting memory is allowed to block, so if this device requires + * unencrypted memory it must come from atomic pools. + */ +static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, + unsigned long attrs) +{ + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return false; + if (gfpflags_allow_blocking(gfp)) + return false; + if (force_dma_unencrypted(dev)) + return true; + if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return false; + if (dma_alloc_need_uncached(dev, attrs)) + return true; + return false; +} + +static inline bool dma_should_free_from_pool(struct device *dev, + unsigned long attrs) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return true; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) + return false; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) + return true; + return false; +} + struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { @@ -89,8 +122,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -125,10 +158,8 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs) && - !gfpflags_allow_blocking(gfp)) { - ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (dma_should_alloc_from_pool(dev, gfp, attrs)) { + ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); if (!ret) return NULL; goto done; @@ -204,6 +235,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, attrs) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a 
kernel address */ @@ -211,10 +247,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) - return; - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c new file mode 100644 index 000000000000..35bb51c31fff --- /dev/null +++ b/kernel/dma/pool.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2020 Google LLC + */ +#include <linux/debugfs.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/dma-contiguous.h> +#include <linux/init.h> +#include <linux/genalloc.h> +#include <linux/set_memory.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +static struct gen_pool *atomic_pool_dma __ro_after_init; +static unsigned long pool_size_dma; +static struct gen_pool *atomic_pool_dma32 __ro_after_init; +static unsigned long pool_size_dma32; +static struct gen_pool *atomic_pool_kernel __ro_after_init; +static unsigned long pool_size_kernel; + +/* Size can be defined by the coherent_pool command line */ +static size_t atomic_pool_size; + +/* Dynamic background expansion when the atomic pool is near capacity */ +static struct work_struct atomic_pool_work; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static void __init dma_atomic_pool_debugfs_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("dma_pools", NULL); + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); + debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); + debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); +} + +static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) +{ + if (gfp & __GFP_DMA) + pool_size_dma += size; + else if (gfp & __GFP_DMA32) + pool_size_dma32 += size; + else + pool_size_kernel += size; +} + +static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, + gfp_t gfp) +{ + unsigned int order; + struct page *page; + void *addr; + int ret = -ENOMEM; + + /* Cannot allocate larger than MAX_ORDER-1 */ + order = min(get_order(pool_size), MAX_ORDER-1); + + do { + pool_size = 1 << (PAGE_SHIFT + order); + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + else + page = alloc_pages(gfp, order); + } while (!page && order-- > 0); + if (!page) + goto out; + + arch_dma_prep_coherent(page, pool_size); + +#ifdef CONFIG_DMA_DIRECT_REMAP + addr = dma_common_contiguous_remap(page, pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); + if (!addr) + goto free_page; +#else + addr = page_to_virt(page); +#endif + /* + * Memory in the atomic DMA pools must be unencrypted, the pools do not + * shrink so no re-encryption occurs in dma_direct_free_pages(). 
+ */ + ret = set_memory_decrypted((unsigned long)page_to_virt(page), + 1 << order); + if (ret) + goto remove_mapping; + ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page), + pool_size, NUMA_NO_NODE); + if (ret) + goto encrypt_mapping; + + dma_atomic_pool_size_add(gfp, pool_size); + return 0; + +encrypt_mapping: + ret = set_memory_encrypted((unsigned long)page_to_virt(page), + 1 << order); + if (WARN_ON_ONCE(ret)) { + /* Decrypt succeeded but encrypt failed, purposely leak */ + goto out; + } +remove_mapping: +#ifdef CONFIG_DMA_DIRECT_REMAP + dma_common_free_remap(addr, pool_size); +#endif +free_page: __maybe_unused + if (!dma_release_from_contiguous(NULL, page, 1 << order)) + __free_pages(page, order); +out: + return ret; +} + +static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp) +{ + if (pool && gen_pool_avail(pool) < atomic_pool_size) + atomic_pool_expand(pool, gen_pool_size(pool), gfp); +} + +static void atomic_pool_work_fn(struct work_struct *work) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + atomic_pool_resize(atomic_pool_dma, + GFP_KERNEL | GFP_DMA); + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + atomic_pool_resize(atomic_pool_dma32, + GFP_KERNEL | GFP_DMA32); + atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL); +} + +static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, + gfp_t gfp) +{ + struct gen_pool *pool; + int ret; + + pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!pool) + return NULL; + + gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); + + ret = atomic_pool_expand(pool, pool_size, gfp); + if (ret) { + gen_pool_destroy(pool); + pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n", + pool_size >> 10, &gfp); + return NULL; + } + + pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n", + gen_pool_size(pool) >> 10, &gfp); + return pool; +} + +static int __init dma_atomic_pool_init(void) +{ + int ret = 0; + + /* + * If coherent_pool was not used on the command line, default the pool + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. 
+ */ + if (!atomic_pool_size) { + atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) * + SZ_128K; + atomic_pool_size = min_t(size_t, atomic_pool_size, + 1 << (PAGE_SHIFT + MAX_ORDER-1)); + } + INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); + + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL); + if (!atomic_pool_kernel) + ret = -ENOMEM; + if (IS_ENABLED(CONFIG_ZONE_DMA)) { + atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA); + if (!atomic_pool_dma) + ret = -ENOMEM; + } + if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, + GFP_KERNEL | GFP_DMA32); + if (!atomic_pool_dma32) + ret = -ENOMEM; + } + + dma_atomic_pool_debugfs_init(); + return ret; +} +postcore_initcall(dma_atomic_pool_init); + +static inline struct gen_pool *dev_to_pool(struct device *dev) +{ + u64 phys_mask; + gfp_t gfp; + + gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + return atomic_pool_dma; + if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) + return atomic_pool_dma32; + return atomic_pool_kernel; +} + +static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (unlikely(!pool)) + return false; + return gen_pool_has_addr(pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(struct device *dev, size_t size, + struct page **ret_page, gfp_t flags) +{ + struct gen_pool *pool = dev_to_pool(dev); + unsigned long val; + void *ptr = NULL; + + if (!pool) { + WARN(1, "%pGg atomic pool not initialised!\n", &flags); + return NULL; + } + + val = gen_pool_alloc(pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); + + return ptr; +} + +bool dma_free_from_pool(struct device *dev, void *start, size_t size) +{ + struct gen_pool *pool = dev_to_pool(dev); + + if (!dma_in_atomic_pool(dev, start, size)) + return false; + gen_pool_free(pool, (unsigned long)start, size); + return true; +} diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 914ff5a58dd5..e739a6eea6e7 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,13 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2012 ARM Ltd. 
* Copyright (c) 2014 The Linux Foundation */ -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> -#include <linux/init.h> -#include <linux/genalloc.h> +#include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -73,117 +68,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } - -#ifdef CONFIG_DMA_DIRECT_REMAP -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static gfp_t dma_atomic_pool_gfp(void) -{ - if (IS_ENABLED(CONFIG_ZONE_DMA)) - return GFP_DMA; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - return GFP_DMA32; - return GFP_KERNEL; -} - -static int __init dma_atomic_pool_init(void) -{ - unsigned int pool_size_order = get_order(atomic_pool_size); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - int ret; - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); - if (!page) - goto out; - - arch_dma_prep_coherent(page, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), atomic_pool_size, -1); - if (ret) - goto remove_mapping; - gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} -postcore_initcall(dma_atomic_pool_init); - -bool dma_in_atomic_pool(void *start, size_t size) -{ - if (unlikely(!atomic_pool)) - return false; - - return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); -} - -void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -bool dma_free_from_pool(void *start, size_t size) -{ - if (!dma_in_atomic_pool(start, size)) - return false; - gen_pool_free(atomic_pool, (unsigned long)start, size); - return true; -} -#endif /* CONFIG_DMA_DIRECT_REMAP */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e330f330303..fcfadecd3a08 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12220,7 +12220,7 @@ static void perf_event_exit_task_context(struct 
task_struct *child, int ctxn) * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_mutex held when called from - * install_exec_creds(). + * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3cc8416ec844..b48d7039a015 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -213,6 +213,15 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, list_del(&bp->hw.bp_list); } +__weak int arch_reserve_bp_slot(struct perf_event *bp) +{ + return 0; +} + +__weak void arch_release_bp_slot(struct perf_event *bp) +{ +} + /* * Function to perform processor-specific cleanup during unregistration */ @@ -270,6 +279,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) struct bp_busy_slots slots = {0}; enum bp_type_idx type; int weight; + int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) @@ -294,6 +304,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; + ret = arch_reserve_bp_slot(bp); + if (ret) + return ret; + toggle_bp_slot(bp, true, type, weight); return 0; @@ -317,6 +331,8 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int weight; + arch_release_bp_slot(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ece7e13f6e4a..eddc8db96027 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -162,14 +162,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, }; int err; struct mmu_notifier_range range; - struct mem_cgroup *memcg; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false); + err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -179,17 +177,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) { - if (new_page) - mem_cgroup_cancel_charge(new_page, memcg, false); + if (!page_vma_mapped_walk(&pvmw)) goto unlock; - } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/kernel/fork.c b/kernel/fork.c index be98e94cb3cc..cefe8745c46e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1759,7 +1759,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)); + ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index e13ca842eb7e..c1510f0ab3ea 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 | find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --numeric-owner 
--no-recursion \ - -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null + -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); @@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance. */ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). 
*/ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -430,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. */ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -446,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -461,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -470,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -550,10 +564,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -572,7 +586,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. 
* At this point user must have been enabled trace mode, @@ -590,15 +603,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. */ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -627,41 +637,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -677,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -697,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -716,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code. + * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. 
A userspace process passes this handle to @@ -728,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system calls handler when a @@ -749,70 +761,136 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. * - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc() * In turns kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; + struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; - t = current; + + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov). */ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note, that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. 
+ */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); + kcov = remote->kcov; /* Put in kcov_remote_stop(). */ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + mode = kcov->mode; + sequence = kcov->sequence; + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } + + local_irq_save(flags); + /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } + kcov_start(t, kcov, size, area, mode, sequence); - kcov_start(t, size, area, mode, sequence); + local_irq_restore(flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -876,34 +954,58 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; + unsigned long flags; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); + return; + } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); return; } kcov_stop(t); - t->kcov = NULL; + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check. */ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + + local_irq_restore(flags); + /* Get in kcov_remote_start(). 
*/ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -917,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..50cd84f53df0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2684,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c3512e7e0801..f76fdb925532 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -191,18 +191,21 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) +static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, + unsigned int symndx, Elf_Shdr *relasec, + 
const char *sec_objname) { - int i, cnt, vmlinux, ret; - char objname[MODULE_NAME_LEN]; - char symname[KSYM_NAME_LEN]; - char *strtab = pmod->core_kallsyms.strtab; + int i, cnt, ret; + char sym_objname[MODULE_NAME_LEN]; + char sym_name[KSYM_NAME_LEN]; Elf_Rela *relas; Elf_Sym *sym; unsigned long sympos, addr; + bool sym_vmlinux; + bool sec_vmlinux = !strcmp(sec_objname, "vmlinux"); /* - * Since the field widths for objname and symname in the sscanf() + * Since the field widths for sym_objname and sym_name in the sscanf() * call are hard-coded and correspond to MODULE_NAME_LEN and * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN * and KSYM_NAME_LEN have the values we expect them to have. @@ -216,27 +219,40 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); return -EINVAL; } - /* Format: .klp.sym.objname.symname,sympos */ + /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, ".klp.sym.%55[^.].%127[^,],%lu", - objname, symname, &sympos); + sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } + sym_vmlinux = !strcmp(sym_objname, "vmlinux"); + + /* + * Prevent module-specific KLP rela sections from referencing + * vmlinux symbols. This helps prevent ordering issues with + * module special section initializations. Presumably such + * symbols are exported and normal relas can be used instead. + */ + if (!sec_vmlinux && sym_vmlinux) { + pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section", + sym_name); + return -EINVAL; + } + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ - vmlinux = !strcmp(objname, "vmlinux"); - ret = klp_find_object_symbol(vmlinux ? NULL : objname, - symname, sympos, &addr); + ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname, + sym_name, sympos, &addr); if (ret) return ret; @@ -246,54 +262,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) return 0; } -static int klp_write_object_relocations(struct module *pmod, - struct klp_object *obj) +/* + * At a high-level, there are two types of klp relocation sections: those which + * reference symbols which live in vmlinux; and those which reference symbols + * which live in other modules. This function is called for both types: + * + * 1) When a klp module itself loads, the module code calls this function to + * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections). + * These relocations are written to the klp module text to allow the patched + * code/data to reference unexported vmlinux symbols. They're written as + * early as possible to ensure that other module init code (.e.g., + * jump_label_apply_nops) can access any unexported vmlinux symbols which + * might be referenced by the klp module's special sections. + * + * 2) When a to-be-patched module loads -- or is already loaded when a + * corresponding klp module loads -- klp code calls this function to write + * module-specific klp relocations (.klp.rela.{module}.* sections). 
These + * are written to the klp module text to allow the patched code/data to + * reference symbols which live in the to-be-patched module or one of its + * module dependencies. Exported symbols are supported, in addition to + * unexported symbols, in order to enable late module patching, which allows + * the to-be-patched module to be loaded and patched sometime *after* the + * klp module is loaded. + */ +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) { - int i, cnt, ret = 0; - const char *objname, *secname; + int cnt, ret; char sec_objname[MODULE_NAME_LEN]; - Elf_Shdr *sec; + Elf_Shdr *sec = sechdrs + secndx; - if (WARN_ON(!klp_is_object_loaded(obj))) + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name\n", + shstrtab + sec->sh_name); return -EINVAL; + } - objname = klp_is_module(obj) ? obj->name : "vmlinux"; - - /* For each klp relocation section */ - for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { - sec = pmod->klp_info->sechdrs + i; - secname = pmod->klp_info->secstrings + sec->sh_name; - if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) - continue; - - /* - * Format: .klp.rela.sec_objname.section_name - * See comment in klp_resolve_symbols() for an explanation - * of the selected field width value. - */ - cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); - if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name\n", - secname); - ret = -EINVAL; - break; - } - - if (strcmp(objname, sec_objname)) - continue; - - ret = klp_resolve_symbols(sec, pmod); - if (ret) - break; + if (strcmp(objname ? objname : "vmlinux", sec_objname)) + return 0; - ret = apply_relocate_add(pmod->klp_info->sechdrs, - pmod->core_kallsyms.strtab, - pmod->klp_info->symndx, i, pmod); - if (ret) - break; - } + ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); + if (ret) + return ret; - return ret; + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); } /* @@ -724,10 +745,27 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? 
func->old_sympos : 1); } -/* Arches may override this to finish any remaining arch-specific tasks */ -void __weak arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) +static int klp_apply_object_relocs(struct klp_patch *patch, + struct klp_object *obj) { + int i, ret; + struct klp_modinfo *info = patch->mod->klp_info; + + for (i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *sec = info->sechdrs + i; + + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; + + ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + info->secstrings, + patch->mod->core_kallsyms.strtab, + info->symndx, i, obj->name); + if (ret) + return ret; + } + + return 0; } /* parts of the initialization that is done only when the object is loaded */ @@ -737,21 +775,18 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - mutex_lock(&text_mutex); - - module_disable_ro(patch->mod); - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) { - module_enable_ro(patch->mod, true); - mutex_unlock(&text_mutex); - return ret; + if (klp_is_module(obj)) { + /* + * Only write module-specific relocations here + * (.klp.rela.{module}.*). vmlinux-specific relocations were + * written earlier during the initialization of the klp module + * itself. + */ + ret = klp_apply_object_relocs(patch, obj); + if (ret) + return ret; } - arch_klp_init_object_loaded(patch, obj); - module_enable_ro(patch->mod, true); - - mutex_unlock(&text_mutex); - klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, @@ -1139,6 +1174,11 @@ int klp_module_coming(struct module *mod) if (WARN_ON(mod->state != MODULE_STATE_COMING)) return -EINVAL; + if (!strcmp(mod->name, "vmlinux")) { + pr_err("vmlinux.ko: invalid module name"); + return -EINVAL; + } + mutex_lock(&klp_mutex); /* * Each module has to know that klp_module_coming() diff --git a/kernel/module.c b/kernel/module.c index be5413903d20..ef400c389f49 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1946,7 +1946,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -1960,6 +1959,14 @@ static void mod_sysfs_teardown(struct module *mod) * * These values are always page-aligned (as is base) */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the + * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of + * whether we are strict. 
+ */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX static void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -1969,6 +1976,15 @@ static void frob_text(const struct module_layout *layout, layout->text_size >> PAGE_SHIFT); } +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + #ifdef CONFIG_STRICT_MODULE_RWX static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) @@ -2000,20 +2016,7 @@ static void frob_writable_data(const struct module_layout *layout, (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } -/* livepatching wants to disable read-only so it can frob module. */ -void module_disable_ro(const struct module *mod) -{ - if (!rodata_enabled) - return; - - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); -} - -void module_enable_ro(const struct module *mod, bool after_init) +static void module_enable_ro(const struct module *mod, bool after_init) { if (!rodata_enabled) return; @@ -2039,19 +2042,29 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; + int i; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + return -ENOEXEC; + } + + return 0; +} + #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } -#endif /* CONFIG_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) +static void module_enable_ro(const struct module *mod, bool after_init) {} +static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) { - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); + return 0; } -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - +#endif /* CONFIG_STRICT_MODULE_RWX */ #ifdef CONFIG_LIVEPATCH /* @@ -2337,11 +2350,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; - /* Livepatch relocation sections are applied by livepatch */ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) + err = klp_apply_section_relocs(mod, info->sechdrs, + info->secstrings, + info->strtab, + info->index.sym, i, + NULL); + else if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); else if (info->sechdrs[i].sh_type == SHT_RELA) @@ -3395,6 +3410,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (err < 0) return ERR_PTR(err); + err = module_enforce_rwx_sections(info->hdr, info->sechdrs, + info->secstrings, info->mod); + if (err 
< 0) + return ERR_PTR(err); + /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; diff --git a/kernel/padata.c b/kernel/padata.c index aae789896616..29fc5d87a4cd 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -7,6 +7,9 @@ * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan <daniel.m.jordan@oracle.com> + * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. @@ -21,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <linux/completion.h> #include <linux/export.h> #include <linux/cpumask.h> #include <linux/err.h> @@ -31,11 +35,30 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> -#include <linux/module.h> -#define MAX_OBJ_NUM 1000 +#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */ + +struct padata_work { + struct work_struct pw_work; + struct list_head pw_list; /* padata_free_works linkage */ + void *pw_data; +}; + +static DEFINE_SPINLOCK(padata_works_lock); +static struct padata_work *padata_works; +static LIST_HEAD(padata_free_works); + +struct padata_mt_job_state { + spinlock_t lock; + struct completion completion; + struct padata_mt_job *job; + int nworks; + int nworks_fini; + unsigned long chunk_size; +}; static void padata_free_pd(struct parallel_data *pd); +static void __init padata_mt_helper(struct work_struct *work); static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { @@ -59,30 +82,82 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *parallel_work) +static struct padata_work *padata_work_alloc(void) { - struct padata_parallel_queue *pqueue; - LIST_HEAD(local_list); + struct padata_work *pw; - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); + lockdep_assert_held(&padata_works_lock); - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); + if (list_empty(&padata_free_works)) + return NULL; /* No more work items allowed to be queued. */ - while (!list_empty(&local_list)) { - struct padata_priv *padata; + pw = list_first_entry(&padata_free_works, struct padata_work, pw_list); + list_del(&pw->pw_list); + return pw; +} - padata = list_entry(local_list.next, - struct padata_priv, list); +static void padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data, int flags) +{ + if (flags & PADATA_WORK_ONSTACK) + INIT_WORK_ONSTACK(&pw->pw_work, work_fn); + else + INIT_WORK(&pw->pw_work, work_fn); + pw->pw_data = data; +} - list_del_init(&padata->list); +static int __init padata_work_alloc_mt(int nworks, void *data, + struct list_head *head) +{ + int i; - padata->parallel(padata); + spin_lock(&padata_works_lock); + /* Start at 1 because the current task participates in the job. 
*/ + for (i = 1; i < nworks; ++i) { + struct padata_work *pw = padata_work_alloc(); + + if (!pw) + break; + padata_work_init(pw, padata_mt_helper, data, 0); + list_add(&pw->pw_list, head); + } + spin_unlock(&padata_works_lock); + + return i; +} + +static void padata_work_free(struct padata_work *pw) +{ + lockdep_assert_held(&padata_works_lock); + list_add(&pw->pw_list, &padata_free_works); +} + +static void __init padata_works_free(struct list_head *works) +{ + struct padata_work *cur, *next; + + if (list_empty(works)) + return; + + spin_lock(&padata_works_lock); + list_for_each_entry_safe(cur, next, works, pw_list) { + list_del(&cur->pw_list); + padata_work_free(cur); } + spin_unlock(&padata_works_lock); +} +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_work *pw = container_of(parallel_work, struct padata_work, + pw_work); + struct padata_priv *padata = pw->pw_data; + + local_bh_disable(); + padata->parallel(padata); + spin_lock(&padata_works_lock); + padata_work_free(pw); + spin_unlock(&padata_works_lock); local_bh_enable(); } @@ -106,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, target_cpu, err; - struct padata_parallel_queue *queue; + int i, cpu, cpu_index, err; struct parallel_data *pd; + struct padata_work *pw; rcu_read_lock_bh(); @@ -136,25 +211,25 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - target_cpu = padata_cpu_hash(pd, padata->seq_nr); - padata->cpu = target_cpu; - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); + rcu_read_unlock_bh(); - queue_work(pinst->parallel_wq, &queue->work); + spin_lock(&padata_works_lock); + padata->seq_nr = ++pd->seq_nr; + pw = padata_work_alloc(); + spin_unlock(&padata_works_lock); + if (pw) { + padata_work_init(pw, padata_parallel_worker, padata, 0); + queue_work(pinst->parallel_wq, &pw->pw_work); + } else { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + return 0; out: rcu_read_unlock_bh(); @@ -325,8 +400,9 @@ static void padata_serial_worker(struct work_struct *serial_work) void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; + int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - padata->cpu); + hashed_cpu); struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); @@ -387,6 +463,98 @@ out: return err; } +static void __init padata_mt_helper(struct work_struct *w) +{ + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; + struct padata_mt_job *job = ps->job; + bool done; + + spin_lock(&ps->lock); + + while (job->size > 0) { + unsigned long start, size, end; + + start = job->start; + /* So end is chunk size aligned if enough work remains. 
*/ + size = roundup(start + 1, ps->chunk_size) - start; + size = min(size, job->size); + end = start + size; + + job->start = end; + job->size -= size; + + spin_unlock(&ps->lock); + job->thread_fn(start, end, job->fn_arg); + spin_lock(&ps->lock); + } + + ++ps->nworks_fini; + done = (ps->nworks_fini == ps->nworks); + spin_unlock(&ps->lock); + + if (done) + complete(&ps->completion); +} + +/** + * padata_do_multithreaded - run a multithreaded job + * @job: Description of the job. + * + * See the definition of struct padata_mt_job for more details. + */ +void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + /* In case threads finish at different times. */ + static const unsigned long load_balance_factor = 4; + struct padata_work my_work, *pw; + struct padata_mt_job_state ps; + LIST_HEAD(works); + int nworks; + + if (job->size == 0) + return; + + /* Ensure at least one thread when size < min_chunk. */ + nworks = max(job->size / job->min_chunk, 1ul); + nworks = min(nworks, job->max_threads); + + if (nworks == 1) { + /* Single thread, no coordination needed, cut to the chase. */ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); + return; + } + + spin_lock_init(&ps.lock); + init_completion(&ps.completion); + ps.job = job; + ps.nworks = padata_work_alloc_mt(nworks, &ps, &works); + ps.nworks_fini = 0; + + /* + * Chunk size is the amount of work a helper does per call to the + * thread function. Load balance large jobs between threads by + * increasing the number of chunks, guarantee at least the minimum + * chunk size from the caller, and honor the caller's alignment. + */ + ps.chunk_size = job->size / (ps.nworks * load_balance_factor); + ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = roundup(ps.chunk_size, job->align); + + list_for_each_entry(pw, &works, pw_list) + queue_work(system_unbound_wq, &pw->pw_work); + + /* Use the current thread, which saves starting a workqueue worker. */ + padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); + padata_mt_helper(&my_work.pw_work); + + /* Wait for all the helpers to finish. 
*/ + wait_for_completion(&ps.completion); + + destroy_work_on_stack(&my_work.pw_work); + padata_works_free(&works); +} + static void __padata_list_init(struct padata_list *pd_list) { INIT_LIST_HEAD(&pd_list->list); @@ -417,8 +585,6 @@ static void padata_init_pqueues(struct parallel_data *pd) pqueue = per_cpu_ptr(pd->pqueue, cpu); __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } } @@ -452,7 +618,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); @@ -1052,32 +1218,41 @@ void padata_free_shell(struct padata_shell *ps) } EXPORT_SYMBOL(padata_free_shell); -#ifdef CONFIG_HOTPLUG_CPU - -static __init int padata_driver_init(void) +void __init padata_init(void) { + unsigned int i, possible_cpus; +#ifdef CONFIG_HOTPLUG_CPU int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", padata_cpu_online, NULL); if (ret < 0) - return ret; + goto err; hp_online = ret; ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", NULL, padata_cpu_dead); - if (ret < 0) { - cpuhp_remove_multi_state(hp_online); - return ret; - } - return 0; -} -module_init(padata_driver_init); + if (ret < 0) + goto remove_online_state; +#endif -static __exit void padata_driver_exit(void) -{ + possible_cpus = num_possible_cpus(); + padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work), + GFP_KERNEL); + if (!padata_works) + goto remove_dead_state; + + for (i = 0; i < possible_cpus; ++i) + list_add(&padata_works[i].pw_list, &padata_free_works); + + return; + +remove_dead_state: +#ifdef CONFIG_HOTPLUG_CPU cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); +remove_online_state: cpuhp_remove_multi_state(hp_online); -} -module_exit(padata_driver_exit); +err: #endif + pr_warn("padata: initialization failed\n"); +} diff --git a/kernel/pid.c b/kernel/pid.c index c835b844aca7..f1496b757162 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -363,6 +363,25 @@ void change_pid(struct task_struct *task, enum pid_type type, attach_pid(task, type); } +void exchange_tids(struct task_struct *left, struct task_struct *right) +{ + struct pid *pid1 = left->thread_pid; + struct pid *pid2 = right->thread_pid; + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + + /* Swap the single entry tid lists */ + hlists_swap_heads_rcu(head1, head2); + + /* Swap the per task_struct pid */ + rcu_assign_pointer(left->thread_pid, pid2); + rcu_assign_pointer(right->thread_pid, pid1); + + /* Swap the cached value */ + WRITE_ONCE(left->pid, pid_nr(pid2)); + WRITE_ONCE(right->pid, pid_nr(pid1)); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) @@ -476,8 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 6d475281c730..562aa0e450ed 100644 --- a/kernel/power/poweroff.c +++ 
b/kernel/power/poweroff.c @@ -29,7 +29,7 @@ static void handle_poweroff(int key) schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } -static struct sysrq_key_op sysrq_poweroff_op = { +static const struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "poweroff(o)", .action_msg = "Power Off", diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ae76bd329582..54a6dba0280d 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -807,7 +807,7 @@ static void sysrq_show_rcu(int key) show_rcu_gp_kthreads(); } -static struct sysrq_key_op sysrq_rcudump_op = { +static const struct sysrq_key_op sysrq_rcudump_op = { .handler = sysrq_show_rcu, .help_msg = "show-rcu(y)", .action_msg = "Show RCU tree", diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..204867220f8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -991,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1059,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. 
*/ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1131,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7efe0d499c1d..715774d8c55f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,6 +118,7 @@ static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; +static int two_hundred = 200; static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; @@ -2732,7 +2733,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = &two_hundred, }, #ifdef CONFIG_HUGETLB_PAGE { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2fd3b3fa68bf..165117996ea0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -47,85 +47,65 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) /* * Functions for validating access to tasks. */ -static struct task_struct *lookup_task(const pid_t pid, bool thread, - bool gettime) +static struct pid *pid_for_clock(const clockid_t clock, bool gettime) { - struct task_struct *p; + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t upid = CPUCLOCK_PID(clock); + struct pid *pid; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; /* * If the encoded PID is 0, then the timer is targeted at current * or the process to which current belongs. */ - if (!pid) - return thread ? current : current->group_leader; + if (upid == 0) + return thread ? task_pid(current) : task_tgid(current); - p = find_task_by_vpid(pid); - if (!p) - return p; - - if (thread) - return same_thread_group(p, current) ? p : NULL; + pid = find_vpid(upid); + if (!pid) + return NULL; - if (gettime) { - /* - * For clock_gettime(PROCESS) the task does not need to be - * the actual group leader. tsk->sighand gives - * access to the group's clock. 
- * - * Timers need the group leader because they take a - * reference on it and store the task pointer until the - * timer is destroyed. - */ - return (p == current || thread_group_leader(p)) ? p : NULL; + if (thread) { + struct task_struct *tsk = pid_task(pid, PIDTYPE_PID); + return (tsk && same_thread_group(tsk, current)) ? pid : NULL; } /* - * For processes require that p is group leader. + * For clock_gettime(PROCESS) allow finding the process by + * with the pid of the current task. The code needs the tgid + * of the process so that pid_task(pid, PIDTYPE_TGID) can be + * used to find the process. */ - return has_group_leader_pid(p) ? p : NULL; + if (gettime && (pid == task_pid(current))) + return task_tgid(current); + + /* + * For processes require that pid identifies a process. + */ + return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL; } -static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool getref, bool gettime) +static inline int validate_clock_permissions(const clockid_t clock) { - const bool thread = !!CPUCLOCK_PERTHREAD(clock); - const pid_t pid = CPUCLOCK_PID(clock); - struct task_struct *p; - - if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) - return NULL; + int ret; rcu_read_lock(); - p = lookup_task(pid, thread, gettime); - if (p && getref) - get_task_struct(p); + ret = pid_for_clock(clock, false) ? 0 : -EINVAL; rcu_read_unlock(); - return p; -} - -static inline struct task_struct *get_task_for_clock(const clockid_t clock) -{ - return __get_task_for_clock(clock, true, false); -} -static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) -{ - return __get_task_for_clock(clock, true, true); -} - -static inline int validate_clock_permissions(const clockid_t clock) -{ - return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; + return ret; } -static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +static inline enum pid_type clock_pid_type(const clockid_t clock) { - return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; + return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID; } static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) { - return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); + return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock)); } /* @@ -373,15 +353,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) struct task_struct *tsk; u64 t; - tsk = get_task_for_clock_get(clock); - if (!tsk) + rcu_read_lock(); + tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock)); + if (!tsk) { + rcu_read_unlock(); return -EINVAL; + } if (CPUCLOCK_PERTHREAD(clock)) t = cpu_clock_sample(clkid, tsk); else t = cpu_clock_sample_group(clkid, tsk, false); - put_task_struct(tsk); + rcu_read_unlock(); *tp = ns_to_timespec64(t); return 0; @@ -394,19 +377,19 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - struct task_struct *p = get_task_for_clock(new_timer->it_clock); + struct pid *pid; - if (!p) + rcu_read_lock(); + pid = pid_for_clock(new_timer->it_clock, false); + if (!pid) { + rcu_read_unlock(); return -EINVAL; + } new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); - /* - * get_task_for_clock() took a reference on @p. Drop it as the timer - * holds a reference on the pid of @p. 
- */ - put_task_struct(p); + new_timer->it.cpu.pid = get_pid(pid); + rcu_read_unlock(); return 0; } diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 891ccad5f271..9fbe1e237563 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ enum { /* struct worker is defined in workqueue_internal.h */ struct worker_pool { - spinlock_t lock; /* the pool lock */ + raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ @@ -300,8 +300,9 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ -static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ +static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +/* wait for manager to go away */ +static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -826,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -881,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -900,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task) if (next) wake_up_process(next->task); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -911,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task) * the scheduler to get a worker's last known identity. * * CONTEXT: - * spin_lock_irq(rq->lock) + * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during @@ -942,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task) * Set @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { @@ -967,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(pool->lock) + * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -1015,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * actually occurs, it should be easy to locate the culprit work function. 
* * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL @@ -1050,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -1128,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } } @@ -1163,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { @@ -1262,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!pool) goto fail; - spin_lock(&pool->lock); + raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point @@ -1291,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } - spin_unlock(&pool->lock); + raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*flags); @@ -1316,7 +1317,7 @@ fail: * work_struct flags. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) @@ -1433,7 +1434,7 @@ retry: if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_pool->lock); + raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); @@ -1441,11 +1442,11 @@ retry: pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); + raw_spin_unlock(&last_pool->lock); + raw_spin_lock(&pwq->pool->lock); } } else { - spin_lock(&pwq->pool->lock); + raw_spin_lock(&pwq->pool->lock); } /* @@ -1458,7 +1459,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } @@ -1490,7 +1491,7 @@ retry: insert_work(pwq, work, worklist, work_flags); out: - spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pwq->pool->lock); rcu_read_unlock(); } @@ -1759,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work); * necessary. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { @@ -1799,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void worker_leave_idle(struct worker *worker) { @@ -1937,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool) worker_attach_to_pool(worker, pool); /* start the newly created worker */ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -1960,7 +1961,7 @@ fail: * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { @@ -1986,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; @@ -2004,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t) destroy_worker(worker); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) @@ -2035,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t) struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; - spin_lock_irq(&pool->lock); - spin_lock(&wq_mayday_lock); /* for wq->maydays */ + raw_spin_lock_irq(&pool->lock); + raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -2049,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t) send_mayday(work); } - spin_unlock(&wq_mayday_lock); - spin_unlock_irq(&pool->lock); + raw_spin_unlock(&wq_mayday_lock); + raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2069,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t) * may_start_working() %true. * * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ @@ -2078,7 +2079,7 @@ __releases(&pool->lock) __acquires(&pool->lock) { restart: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2094,7 +2095,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have @@ -2117,7 +2118,7 @@ restart: * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: @@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker) pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; - wake_up(&wq_manager_wait); + rcuwait_wake_up(&manager_wait); return true; } @@ -2156,7 +2157,7 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 
*/ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) @@ -2238,7 +2239,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -2293,7 +2294,7 @@ __acquires(&pool->lock) */ cond_resched(); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) @@ -2319,7 +2320,7 @@ __acquires(&pool->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(pool->lock) which may be released and regrabbed + * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2361,11 +2362,11 @@ static int worker_thread(void *__worker) /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); @@ -2431,7 +2432,7 @@ sleep: */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } @@ -2485,7 +2486,7 @@ repeat: should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2497,11 +2498,11 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and @@ -2529,8 +2530,8 @@ repeat: * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ - if (need_to_create_worker(pool)) { - spin_lock(&wq_mayday_lock); + if (pwq->nr_active && need_to_create_worker(pool)) { + raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. @@ -2539,7 +2540,7 @@ repeat: get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } - spin_unlock(&wq_mayday_lock); + raw_spin_unlock(&wq_mayday_lock); } } @@ -2557,14 +2558,14 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); } - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); @@ -2644,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work) * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(pool->lock). + * raw_spin_lock_irq(pool->lock). 
*/ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, @@ -2731,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2748,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) @@ -2948,9 +2949,9 @@ reflush: for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2986,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, return false; } - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3002,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, check_flush_dependency(pwq->wq, work); insert_wq_barrier(pwq, barr, work, worker); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); /* * Force a lock recursion deadlock when using flush_work() inside a @@ -3021,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, rcu_read_unlock(); return true; already_gone: - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } @@ -3414,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, */ static int init_worker_pool(struct worker_pool *pool) { - spin_lock_init(&pool->lock); + raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; @@ -3491,7 +3492,6 @@ static void rcu_free_wq(struct rcu_head *rcu) else free_workqueue_attrs(wq->unbound_attrs); - kfree(wq->rescuer); kfree(wq); } @@ -3504,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } +/* This returns with the lock held on success (pool manager is inactive). */ +static bool wq_manager_inactive(struct worker_pool *pool) +{ + raw_spin_lock_irq(&pool->lock); + + if (pool->flags & POOL_MANAGER_ACTIVE) { + raw_spin_unlock_irq(&pool->lock); + return false; + } + return true; +} + /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3539,16 +3551,17 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. + * Because of how wq_manager_inactive() works, we will hold the + * spinlock after a successful wait. 
*/ - spin_lock_irq(&pool->lock); - wait_event_lock_irq(wq_manager_wait, - !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), + TASK_UNINTERRUPTIBLE); pool->flags |= POOL_MANAGER_ACTIVE; while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) @@ -3704,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) return; /* this function can be called during early boot w/ irq disabled */ - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3727,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4129,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, use_dfl_pwq: mutex_lock(&wq->mutex); - spin_lock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); - spin_unlock_irq(&wq->dfl_pwq->pool->lock); + raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); @@ -4208,8 +4221,8 @@ static int init_rescuer(struct workqueue_struct *wq) rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - ret = PTR_ERR_OR_ZERO(rescuer->task); - if (ret) { + if (IS_ERR(rescuer->task)) { + ret = PTR_ERR(rescuer->task); kfree(rescuer); return ret; } @@ -4360,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq) struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ - spin_lock_irq(&wq_mayday_lock); + raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; - spin_unlock_irq(&wq_mayday_lock); + raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); @@ -4376,27 +4389,25 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - spin_lock_irq(&pwq->pool->lock); + raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } - spin_unlock_irq(&pwq->pool->lock); + raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. 
*/ - mutex_lock(&wq_pool_mutex); list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); @@ -4558,10 +4569,10 @@ unsigned int work_busy(struct work_struct *work) rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); } rcu_read_unlock(); @@ -4768,10 +4779,10 @@ void show_workqueue_state(void) pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { - spin_lock_irqsave(&pwq->pool->lock, flags); + raw_spin_lock_irqsave(&pwq->pool->lock, flags); if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); - spin_unlock_irqrestore(&pwq->pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4785,7 +4796,7 @@ void show_workqueue_state(void) struct worker *worker; bool first = true; - spin_lock_irqsave(&pool->lock, flags); + raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; @@ -4804,7 +4815,7 @@ void show_workqueue_state(void) } pr_cont("\n"); next_pool: - spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_workqueue_state(). Avoid triggering @@ -4834,7 +4845,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) struct worker_pool *pool = worker->pool; if (pool) { - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If @@ -4848,7 +4859,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) scnprintf(buf + off, size - off, "-%s", worker->desc); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4879,7 +4890,7 @@ static void unbind_workers(int cpu) for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers @@ -4893,7 +4904,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); /* @@ -4919,9 +4930,9 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. 
*/ - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } } @@ -4948,7 +4959,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); - spin_lock_irq(&pool->lock); + raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; @@ -4987,7 +4998,7 @@ static void rebind_workers(struct worker_pool *pool) WRITE_ONCE(worker->flags, worker_flags); } - spin_unlock_irq(&pool->lock); + raw_spin_unlock_irq(&pool->lock); } /** @@ -5906,7 +5917,7 @@ void __init workqueue_init_early(void) int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |
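The module_enforce_rwx_sections() hunk above rejects any module whose ELF image contains a section flagged both SHF_WRITE and SHF_EXECINSTR. A minimal userspace sketch of the same scan over the section header table, using the standard <elf.h> definitions and assuming a 64-bit ELF file (the file path and exit codes are illustrative, not part of the kernel change):

/* cc -o wxcheck wxcheck.c && ./wxcheck some_module.ko */
#include <elf.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
	Elf64_Ehdr ehdr;
	Elf64_Shdr shdr;
	FILE *f;
	int i, ret = 0;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <elf-file>\n", argv[0]);
		return 2;
	}
	f = fopen(argv[1], "rb");
	if (!f || fread(&ehdr, sizeof(ehdr), 1, f) != 1) {
		perror(argv[1]);
		return 2;
	}
	/* Walk the section header table, as module_enforce_rwx_sections() does. */
	for (i = 0; i < ehdr.e_shnum; i++) {
		if (fseek(f, (long)(ehdr.e_shoff + (unsigned long)i * sizeof(shdr)), SEEK_SET) ||
		    fread(&shdr, sizeof(shdr), 1, f) != 1)
			break;
		if ((shdr.sh_flags & shf_wx) == shf_wx) {
			printf("section %d is both writable and executable\n", i);
			ret = 1;	/* the kernel path returns -ENOEXEC */
		}
	}
	fclose(f);
	return ret;
}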
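The chunk-size computation in padata_do_multithreaded() above cuts the job into roughly load_balance_factor chunks per helper so early finishers keep pulling work, then clamps to the caller's min_chunk and rounds up to the caller's alignment. The same arithmetic in isolation, with made-up job parameters and a local stand-in for the kernel's roundup() macro:

#include <stdio.h>

#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical job: 1M units of work, at least 64 per call, 16-aligned. */
	unsigned long size = 1UL << 20, min_chunk = 64, align = 16;
	unsigned long max_threads = 8;
	const unsigned long load_balance_factor = 4;
	unsigned long nworks, chunk;

	/* At least one helper, at most max_threads, no more than one per min_chunk. */
	nworks = MAX(size / min_chunk, 1ul);
	nworks = MIN(nworks, max_threads);

	/* More chunks than helpers so one slow helper cannot stall the whole job. */
	chunk = size / (nworks * load_balance_factor);
	chunk = MAX(chunk, min_chunk);
	chunk = roundup(chunk, align);

	printf("%lu helpers, chunk size %lu\n", nworks, chunk);
	return 0;
}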
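padata_mt_helper() above has each helper repeatedly claim the next aligned chunk of [start, start + size) under ps->lock and run the thread function with the lock dropped. A pthread sketch of that claiming loop under illustrative names (the thread function just sums a range, and the submitting thread participates, as the "start at 1" comment in padata_work_alloc_mt() describes); the kernel version signals a completion from the last helper instead of joining threads:

/* cc -pthread -o mt mt.c */
#include <pthread.h>
#include <stdio.h>

#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define NWORKERS	4

struct mt_job {
	pthread_mutex_t lock;
	unsigned long start, size, chunk;
	unsigned long long result;	/* accumulated by thread_fn */
};

/* Stand-in for job->thread_fn(start, end, arg). */
static unsigned long long thread_fn(unsigned long start, unsigned long end)
{
	unsigned long long sum = 0;
	for (unsigned long i = start; i < end; i++)
		sum += i;
	return sum;
}

static void *helper(void *arg)
{
	struct mt_job *job = arg;

	pthread_mutex_lock(&job->lock);
	while (job->size > 0) {
		/* Claim a chunk whose end is chunk-size aligned when possible. */
		unsigned long start = job->start;
		unsigned long size = roundup(start + 1, job->chunk) - start;
		unsigned long long part;

		if (size > job->size)
			size = job->size;
		job->start = start + size;
		job->size -= size;

		pthread_mutex_unlock(&job->lock);
		part = thread_fn(start, start + size);	/* work runs unlocked */
		pthread_mutex_lock(&job->lock);
		job->result += part;
	}
	pthread_mutex_unlock(&job->lock);
	return NULL;
}

int main(void)
{
	struct mt_job job = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.start = 0, .size = 1000000, .chunk = 4096,
	};
	pthread_t tid[NWORKERS];

	for (int i = 1; i < NWORKERS; i++)	/* the current thread works too */
		pthread_create(&tid[i], NULL, helper, &job);
	helper(&job);
	for (int i = 1; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);

	printf("sum = %llu\n", job.result);
	return 0;
}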
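padata_do_parallel() above drops the old MAX_OBJ_NUM refcount limit in favour of a preallocated pool of padata_work items: when padata_work_alloc() finds the free list empty, the request simply runs in the submitting task instead of being refused. A sketch of that allocate-or-run-inline decision with a plain array and a lock-protected free list (all names hypothetical; the pool is sized num_possible_cpus() in the kernel, a small constant here):

/* cc -pthread -o pool pool.c */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct work {
	struct work *next;		/* free-list linkage */
	void (*fn)(void *data);
	void *data;
};

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work pool[4];
static struct work *free_list;

static void pool_init(void)
{
	for (size_t i = 0; i < sizeof(pool) / sizeof(pool[0]); i++) {
		pool[i].next = free_list;
		free_list = &pool[i];
	}
}

static struct work *work_alloc(void)
{
	struct work *w;

	pthread_mutex_lock(&pool_lock);
	w = free_list;
	if (w)
		free_list = w->next;
	pthread_mutex_unlock(&pool_lock);
	return w;			/* NULL: no more work items allowed */
}

static void work_free(struct work *w)
{
	pthread_mutex_lock(&pool_lock);
	w->next = free_list;
	free_list = w;
	pthread_mutex_unlock(&pool_lock);
}

/* Submit: use a pooled item if one is free, otherwise run in the caller. */
static void submit(void (*fn)(void *), void *data)
{
	struct work *w = work_alloc();

	if (!w) {
		fn(data);		/* "maximum works limit exceeded" path */
		return;
	}
	w->fn = fn;
	w->data = data;
	/* A real implementation would hand w to a worker thread here; the
	 * sketch runs it synchronously to stay self-contained. */
	w->fn(w->data);
	work_free(w);
}

static void say(void *data) { printf("%s\n", (const char *)data); }

int main(void)
{
	pool_init();
	submit(say, "ran via a pooled work item (or inline when exhausted)");
	return 0;
}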
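The posix-cpu-timers rework above changes how a clockid is resolved to a struct pid, but the clocks it serves are the ordinary per-process and per-thread CPU clocks. For reference, a userspace consumer reaches the same clocks through clock_getcpuclockid() and clock_gettime(); the busy loop below is only there so the reported time is nonzero:

#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	clockid_t clk;
	struct timespec ts;
	int err;

	/* pid 0 means the calling process, like an encoded PID of 0 above. */
	err = clock_getcpuclockid(0, &clk);
	if (err) {
		fprintf(stderr, "clock_getcpuclockid: %s\n", strerror(err));
		return 1;
	}
	/* Burn a little CPU so there is something to report. */
	for (volatile unsigned long i = 0; i < 50000000; i++)
		;
	if (clock_gettime(clk, &ts)) {
		perror("clock_gettime");
		return 1;
	}
	printf("process CPU time: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}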
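put_unbound_pool() above switches from wait_event_lock_irq() to rcuwait_wait_event() with a wq_manager_inactive() predicate that returns with pool->lock already held on success. The same "check under the lock, keep the lock when the check passes" shape with a pthread mutex (illustrative only; this is not the kernel's rcuwait API, and the sketch polls where rcuwait sleeps):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
	pthread_mutex_t lock;
	bool manager_active;
};

/* Returns true with pool->lock held once no manager is active. */
static bool manager_inactive(struct pool *p)
{
	pthread_mutex_lock(&p->lock);
	if (p->manager_active) {
		pthread_mutex_unlock(&p->lock);
		return false;
	}
	return true;
}

static void wait_manager_inactive(struct pool *p)
{
	/* rcuwait_wait_event() sleeps between checks; a sketch can poll. */
	while (!manager_inactive(p))
		sched_yield();
	/* The caller now owns p->lock, as put_unbound_pool() relies on. */
}

int main(void)
{
	struct pool p = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.manager_active = false,
	};

	wait_manager_inactive(&p);
	printf("no manager active; lock held, workers can be destroyed\n");
	pthread_mutex_unlock(&p.lock);
	return 0;
}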
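The init_rescuer() hunk above replaces PTR_ERR_OR_ZERO() with an explicit IS_ERR()/PTR_ERR() pair because kthread_create() never returns NULL: failure comes back as an errno value encoded in the pointer itself. A userspace re-implementation of that encoding, just to show what the check is doing (the kernel's real macros live in include/linux/err.h; create_thing() is a hypothetical stand-in for kthread_create()):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

/* Encode a small negative errno in the top of the address space. */
static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical creator in the style of kthread_create(): never NULL. */
static void *create_thing(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);
	return malloc(16);
}

int main(void)
{
	void *t = create_thing(1);

	if (IS_ERR(t)) {			/* the init_rescuer() pattern */
		fprintf(stderr, "create_thing failed: %ld\n", PTR_ERR(t));
		return 1;
	}
	free(t);
	return 0;
}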