diff options
Diffstat (limited to 'kernel')
53 files changed, 1738 insertions, 701 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 48c5376d290a..daad787fb795 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o @@ -127,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/kdev_t.h> -#include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, - OPT_ERR, }; -static const match_table_t bpf_mount_tokens = { - { OPT_MODE, "mode=%o" }, - { OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { + fsparam_u32oct ("mode", OPT_MODE), + {} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { + .name = "bpf", + .specs = bpf_param_specs, }; struct bpf_mount_opts { umode_t mode; }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option, token; - char *ptr; + struct bpf_mount_opts *opts = fc->fs_private; + struct fs_parse_result result; + int opt; - opts->mode = S_IRWXUGO; - - while ((ptr = strsep(&data, ",")) != NULL) { - if (!*ptr) - continue; - - token = match_token(ptr, bpf_mount_tokens, args); - switch (token) { - case OPT_MODE: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - } + return opt == -ENOPARAM ? 0 : opt; + + switch (opt) { + case OPT_MODE: + opts->mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts opts; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - ret = bpf_parse_options(data, &opts); - if (ret) - return ret; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | opts.mode; + inode->i_mode |= S_ISVTX | opts->mode; return 0; } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, - const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc) { - return mount_nodev(type, flags, data, bpf_fill_super); + kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { + .free = bpf_free_fc, + .parse_param = bpf_parse_param, + .get_tree = bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ + struct bpf_mount_opts *opts; + + opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->mode = S_IRWXUGO; + + fc->fs_private = opts; + fc->ops = &bpf_context_ops; + return 0; } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", - .mount = bpf_mount, + .init_fs_context = bpf_init_fs_context, + .parameters = &bpf_fs_parameters, .kill_sb = kill_litter_super, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index e1967e9eddc2..fc28e17940e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; void __init cpu_smt_disable(bool force) { - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + if (!cpu_smt_possible()) return; if (force) { @@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu) */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } + +/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +bool cpu_smt_possible(void) +{ + return cpu_smt_control != CPU_SMT_FORCE_DISABLED && + cpu_smt_control != CPU_SMT_NOT_SUPPORTED; +} +EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cc608de6883..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -787,11 +787,8 @@ out: } /* - * GDB places a breakpoint at this function to know dynamically - * loaded objects. It's not defined static so that only one instance with this - * name exists in the kernel. + * GDB places a breakpoint at this function to know dynamically loaded objects. */ - static int module_event(struct notifier_block *self, unsigned long val, void *data) { @@ -896,30 +893,25 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. */ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); + kdb_printf("PANIC: %s\n", msg); + kgdb_breakpoint(); - return NOTIFY_DONE; } -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; - void __weak kgdb_arch_late(void) { } @@ -968,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -983,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 9ecfa37c7fbf..4567fe998c30 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -830,7 +830,7 @@ static void parse_grep(const char *str) cp++; while (isspace(*cp)) cp++; - if (strncmp(cp, "grep ", 5)) { + if (!str_has_prefix(cp, "grep ")) { kdb_printf("invalid 'pipe', see grephelp\n"); return; } diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9decbba255fc..73c5c2b8e824 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H config ARCH_HAS_DMA_SET_MASK bool +# +# Select this option if the architecture needs special handling for +# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what +# people thing of when saying write combine, so very few platforms should +# need to enable this. +# +config ARCH_HAS_DMA_WRITE_COMBINE + bool + config DMA_DECLARE_COHERENT bool @@ -45,9 +54,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_DMA_COHERENT_TO_PFN bool -config ARCH_HAS_DMA_MMAP_PGPROT - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 29fd6590dc1e..545e3869b0e3 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -122,18 +122,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_release_coherent_memory(mem); return ret; } -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dma_release_coherent_memory(mem); - dev->dma_mem = NULL; -} -EXPORT_SYMBOL(dma_release_declared_memory); static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) @@ -288,7 +276,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); } -EXPORT_SYMBOL(dma_mmap_from_dev_coherent); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b0038ca3aa92..d9334f31a5af 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -136,17 +136,29 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } +/* + * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems + * that the intention is to allow exporting memory allocated via the + * coherent DMA APIs through the dma_buf API, which only accepts a + * scattertable. This presents a couple of problems: + * 1. Not all memory allocated via the coherent DMA APIs is backed by + * a struct page + * 2. Passing coherent DMA memory into the streaming APIs is not allowed + * as we will try to flush the memory through a different alias to that + * actually being used (and the flushes are redundant.) + */ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (!dma_is_direct(ops) && ops->get_sgtable) - return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); + if (dma_is_direct(ops)) + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + size, attrs); + if (!ops->get_sgtable) + return -ENXIO; + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_get_sgtable_attrs); @@ -161,9 +173,11 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && (attrs & DMA_ATTR_NON_CONSISTENT))) return prot; - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) - return arch_dma_mmap_pgprot(dev, prot, attrs); - return pgprot_noncached(prot); +#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE + if (attrs & DMA_ATTR_WRITE_COMBINE) + return pgprot_writecombine(prot); +#endif + return pgprot_dmacoherent(prot); } #endif /* CONFIG_MMU */ @@ -174,7 +188,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { -#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP +#ifdef CONFIG_MMU unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; @@ -205,8 +219,29 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ +#endif /* CONFIG_MMU */ +} + +/** + * dma_can_mmap - check if a given device supports dma_mmap_* + * @dev: device to check + * + * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to + * map DMA allocations to userspace. + */ +bool dma_can_mmap(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) { + return IS_ENABLED(CONFIG_MMU) && + (dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); + } + + return ops->mmap != NULL; } +EXPORT_SYMBOL_GPL(dma_can_mmap); /** * dma_mmap_attrs - map a coherent DMA allocation into user space @@ -227,31 +262,15 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (!dma_is_direct(ops) && ops->mmap) - return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + if (dma_is_direct(ops)) + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + attrs); + if (!ops->mmap) + return -ENXIO; + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_mmap_attrs); -static u64 dma_default_get_required_mask(struct device *dev) -{ - u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); - u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT)); - u64 mask; - - if (!high_totalram) { - /* convert to mask just covering totalram */ - low_totalram = (1 << (fls(low_totalram) - 1)); - low_totalram += low_totalram - 1; - mask = low_totalram; - } else { - high_totalram = (1 << (fls(high_totalram) - 1)); - high_totalram += high_totalram - 1; - mask = (((u64)high_totalram) << 32) + 0xffffffff; - } - return mask; -} - u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -260,7 +279,16 @@ u64 dma_get_required_mask(struct device *dev) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); - return dma_default_get_required_mask(dev); + + /* + * We require every DMA ops implementation to at least support a 32-bit + * DMA mask (and use bounce buffering if that isn't supported in + * hardware). As the direct mapping code has its own routine to + * actually report an optimal mask we default to 32-bit here as that + * is the right thing for most IOMMUs, and at least not actively + * harmful in general. + */ + return DMA_BIT_MASK(32); } EXPORT_SYMBOL_GPL(dma_get_required_mask); @@ -317,12 +345,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -static inline void dma_check_mask(struct device *dev, u64 mask) -{ - if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) - dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); -} - int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -353,7 +375,6 @@ int dma_set_mask(struct device *dev, u64 mask) return -EIO; arch_dma_set_mask(dev, mask); - dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } @@ -371,7 +392,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) if (!dma_supported(dev, mask)) return -EIO; - dma_check_mask(dev, mask); dev->coherent_dma_mask = mask; return 0; } @@ -405,3 +425,14 @@ size_t dma_max_mapping_size(struct device *dev) return size; } EXPORT_SYMBOL_GPL(dma_max_mapping_size); + +unsigned long dma_get_merge_boundary(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops || !ops->get_merge_boundary) + return 0; /* can't merge */ + + return ops->get_merge_boundary(dev); +} +EXPORT_SYMBOL_GPL(dma_get_merge_boundary); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ffe78f0b2fe4..ca4e5d44b571 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -11,13 +11,21 @@ #include <linux/slab.h> #include <linux/vmalloc.h> +struct page **dma_common_find_pages(void *cpu_addr) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || area->flags != VM_DMA_COHERENT) + return NULL; + return area->pages; +} + static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, unsigned long vm_flags, pgprot_t prot, - const void *caller) + size_t size, pgprot_t prot, const void *caller) { struct vm_struct *area; - area = get_vm_area_caller(size, vm_flags, caller); + area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); if (!area) return NULL; @@ -34,12 +42,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, * Cannot be used in non-sleeping contexts */ void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller) + pgprot_t prot, const void *caller) { struct vm_struct *area; - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + area = __dma_common_pages_remap(pages, size, prot, caller); if (!area) return NULL; @@ -53,7 +60,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size, * Cannot be used in non-sleeping contexts */ void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, pgprot_t prot, const void *caller) { int i; @@ -67,7 +73,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, for (i = 0; i < (size >> PAGE_SHIFT); i++) pages[i] = nth_page(page, i); - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + area = __dma_common_pages_remap(pages, size, prot, caller); kfree(pages); @@ -79,11 +85,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, /* * Unmaps a range previously mapped by dma_common_*_remap */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +void dma_common_free_remap(void *cpu_addr, size_t size) { - struct vm_struct *area = find_vm_area(cpu_addr); + struct page **pages = dma_common_find_pages(cpu_addr); - if (!area || (area->flags & vm_flags) != vm_flags) { + if (!pages) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } @@ -105,7 +111,16 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) { unsigned int pool_size_order = get_order(atomic_pool_size); unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; @@ -117,7 +132,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) page = dma_alloc_from_contiguous(NULL, nr_pages, pool_size_order, false); else - page = alloc_pages(gfp, pool_size_order); + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); if (!page) goto out; @@ -127,8 +142,9 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) if (!atomic_pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP, - prot, __builtin_return_address(0)); + addr = dma_common_contiguous_remap(page, atomic_pool_size, + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); if (!addr) goto destroy_genpool; @@ -143,7 +159,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); + dma_common_free_remap(addr, atomic_pool_size); destroy_genpool: gen_pool_destroy(atomic_pool); atomic_pool = NULL; @@ -155,6 +171,7 @@ out: atomic_pool_size / 1024); return -ENOMEM; } +postcore_initcall(dma_atomic_pool_init); bool dma_in_atomic_pool(void *start, size_t size) { @@ -217,7 +234,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, VM_USERMAP, + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 796a44f8ef5a..673a2cdb2656 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -463,8 +463,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) - pr_warn_once("%s is active and system is using DMA bounce buffers\n", - sme_active() ? "SME" : "SEV"); + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/binfmts.h> +#include <linux/elfcore.h> Elf_Half __weak elf_core_extra_phdrs(void) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f08b17d6426..4655adbbae10 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2239,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event, * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through + * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). @@ -6054,7 +6054,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -6616,7 +6616,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_STACK_USER) { /* - * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. @@ -10917,6 +10917,13 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; + err = security_locked_down(LOCKDOWN_PERF); + if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) + /* REGS_INTR can leak data, lockdown must prevent this */ + return err; + + err = 0; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> +#include <linux/khugepaged.h> #include <linux/uprobes.h> @@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); @@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -488,6 +498,10 @@ retry: ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +515,33 @@ retry: copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if (orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); @@ -513,6 +552,10 @@ put_old: if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } diff --git a/kernel/exit.c b/kernel/exit.c index 22ab6a4bdc51..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,76 +227,13 @@ repeat: write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock(). We check task->sighand != NULL, - * but we can read the already freed and reused memory. - */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. - */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; - - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. - */ - if (!sighand) - return NULL; - - return task; -} - void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; @@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) */ smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. - */ task = rcu_dereference(w->task); if (task) wake_up_process(task); diff --git a/kernel/extable.c b/kernel/extable.c index e23cce6e6092..f6c9406eec7d 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -40,13 +40,20 @@ void __init sort_main_extable(void) } } +/* Given an address, look for it in the kernel exception table */ +const +struct exception_table_entry *search_kernel_exception_table(unsigned long addr) +{ + return search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); +} + /* Given an address, look for it in the exception tables. */ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, - __stop___ex_table - __start___ex_table, addr); + e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/fork.c b/kernel/fork.c index 53e780748fe3..f9572f416126 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) @@ -903,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->cpus_ptr = &tsk->cpus_mask; /* - * One for us, one for whoever does the "release_task()" (usually - * parent) + * One for the user space visible state that goes away when reaped. + * One for the scheduler. */ - refcount_set(&tsk->usage, 2); + refcount_set(&tsk->rcu_users, 2); + /* One for the rcu users */ + refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -1009,7 +1023,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); - hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..060e8e726755 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS if !UML + select CONSTRUCTORS default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b018f1a6e0d..bc933c0db9bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -206,6 +206,14 @@ static inline int kexec_load_check(unsigned long nr_segments, return result; /* + * kexec can be used to circumvent module loading restrictions, so + * prevent loading in that case + */ + result = security_locked_down(LOCKDOWN_KEXEC); + if (result) + return result; + + /* * Verify we have a legal set of flags * This leaves us room for future extensions. */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b8cc032d5620..79f252af7dee 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) { @@ -177,6 +177,59 @@ void kimage_file_post_load_cleanup(struct kimage *image) image->image_loader_data = NULL; } +#ifdef CONFIG_KEXEC_SIG +static int +kimage_validate_signature(struct kimage *image) +{ + const char *reason; + int ret; + + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + switch (ret) { + case 0: + break; + + /* Certain verification errors are non-fatal if we're not + * checking errors, provided we aren't mandating that there + * must be a valid signature. + */ + case -ENODATA: + reason = "kexec of unsigned image"; + goto decide; + case -ENOPKG: + reason = "kexec of image with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "kexec of image with unavailable key"; + decide: + if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + pr_notice("%s rejected\n", reason); + return ret; + } + + /* If IMA is guaranteed to appraise a signature on the kexec + * image, permit it even if the kernel is otherwise locked + * down. + */ + if (!ima_appraise_signature(READING_KEXEC_IMAGE) && + security_locked_down(LOCKDOWN_KEXEC)) + return -EPERM; + + return 0; + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + pr_notice("kernel signature verification failed (%d).\n", ret); + } + + return ret; +} +#endif + /* * In file mode list of segments is prepared by kernel. Copy relevant * data from user space, do error checking, prepare segment list @@ -186,7 +239,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret = 0; + int ret; void *ldata; loff_t size; @@ -202,14 +255,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); +#ifdef CONFIG_KEXEC_SIG + ret = kimage_validate_signature(image); + + if (ret) goto out; - } - pr_debug("kernel signature verification successful.\n"); #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1b66ccbb744a..53534aa258a6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -962,8 +962,15 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { + .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; + +static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ @@ -976,58 +983,75 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static int arm_kprobe_ftrace(struct kprobe *p) +static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, + int *cnt) { int ret = 0; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 0, 0); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } - if (kprobe_ftrace_enabled == 0) { - ret = register_ftrace_function(&kprobe_ftrace_ops); + if (*cnt == 0) { + ret = register_ftrace_function(ops); if (ret) { pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); goto err_ftrace; } } - kprobe_ftrace_enabled++; + (*cnt)++; return ret; err_ftrace: /* - * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a - * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental - * empty filter_hash which would undesirably trace all functions. + * At this point, sinec ops is not registered, we should be sefe from + * registering empty filter. */ - ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); return ret; } +static int arm_kprobe_ftrace(struct kprobe *p) +{ + bool ipmodify = (p->post_handler != NULL); + + return __arm_kprobe_ftrace(p, + ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, + ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +} + /* Caller must lock kprobe_mutex */ -static int disarm_kprobe_ftrace(struct kprobe *p) +static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, + int *cnt) { int ret = 0; - if (kprobe_ftrace_enabled == 1) { - ret = unregister_ftrace_function(&kprobe_ftrace_ops); + if (*cnt == 1) { + ret = unregister_ftrace_function(ops); if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) return ret; } - kprobe_ftrace_enabled--; + (*cnt)--; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 1, 0); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } + +static int disarm_kprobe_ftrace(struct kprobe *p) +{ + bool ipmodify = (p->post_handler != NULL); + + return __disarm_kprobe_ftrace(p, + ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, + ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +} #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) (-ENODEV) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ err: pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 89bab079e7a4..e84d21aa0722 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); + return READ_ONCE(prev->state) != vcpu_running; } /* diff --git a/kernel/module.c b/kernel/module.c index 9ee93421269c..ff2d7359a418 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/extable.h> #include <linux/moduleloader.h> +#include <linux/module_signature.h> #include <linux/trace_events.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -544,12 +545,20 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym) #endif } -static int cmp_name(const void *va, const void *vb) +static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) { - const char *a; - const struct kernel_symbol *b; - a = va; b = vb; - return strcmp(a, kernel_symbol_name(b)); +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + if (!sym->namespace_offset) + return NULL; + return offset_to_ptr(&sym->namespace_offset); +#else + return sym->namespace; +#endif +} + +static int cmp_name(const void *name, const void *sym) +{ + return strcmp(name, kernel_symbol_name(sym)); } static bool find_exported_symbol_in_section(const struct symsearch *syms, @@ -1379,6 +1388,41 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ +static char *get_modinfo(const struct load_info *info, const char *tag); +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev); + +static int verify_namespace_is_imported(const struct load_info *info, + const struct kernel_symbol *sym, + struct module *mod) +{ + const char *namespace; + char *imported_namespace; + + namespace = kernel_symbol_namespace(sym); + if (namespace) { + imported_namespace = get_modinfo(info, "import_ns"); + while (imported_namespace) { + if (strcmp(namespace, imported_namespace) == 0) + return 0; + imported_namespace = get_next_modinfo( + info, "import_ns", imported_namespace); + } +#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + pr_warn( +#else + pr_err( +#endif + "%s: module uses symbol (%s) from namespace %s, but does not import it.\n", + mod->name, kernel_symbol_name(sym), namespace); +#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + return -EINVAL; +#endif + } + return 0; +} + + /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, const struct load_info *info, @@ -1407,6 +1451,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, goto getname; } + err = verify_namespace_is_imported(info, sym, mod); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + err = ref_module(mod, owner); if (err) { sym = ERR_PTR(err); @@ -2481,7 +2531,8 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(struct load_info *info, const char *tag) +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev) { char *p; unsigned int taglen = strlen(tag); @@ -2492,13 +2543,25 @@ static char *get_modinfo(struct load_info *info, const char *tag) * get_modinfo() calls made before rewrite_section_headers() * must use sh_offset, as sh_addr isn't set! */ - for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { + char *modinfo = (char *)info->hdr + infosec->sh_offset; + + if (prev) { + size -= prev - modinfo; + modinfo = next_string(prev, &size); + } + + for (p = modinfo; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } +static char *get_modinfo(const struct load_info *info, const char *tag) +{ + return get_next_modinfo(info, tag, NULL); +} + static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; @@ -2776,8 +2839,9 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, int flags) { - int err = -ENOKEY; + int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; const void *mod = info->hdr; /* @@ -2792,16 +2856,38 @@ static int module_sig_check(struct load_info *info, int flags) err = mod_verify_sig(mod, info); } - if (!err) { + switch (err) { + case 0: info->sig_ok = true; return 0; - } - /* Not having a signature is only an error if we're strict. */ - if (err == -ENOKEY && !is_module_sig_enforced()) - err = 0; + /* We don't permit modules to be loaded into trusted kernels + * without a valid signature on them, but if we're not + * enforcing, certain errors are non-fatal. + */ + case -ENODATA: + reason = "Loading of unsigned module"; + goto decide; + case -ENOPKG: + reason = "Loading of module with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "Loading of module with unavailable key"; + decide: + if (is_module_sig_enforced()) { + pr_notice("%s is rejected\n", reason); + return -EKEYREJECTED; + } - return err; + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + return err; + } } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index 000000000000..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/errno.h> +#include <linux/printk.h> +#include <linux/module_signature.h> +#include <asm/byteorder.h> + +/** + * mod_check_sig - check that the given signature is sane + * + * @ms: Signature to check. + * @file_len: Size of the file to which @ms is appended. + * @name: What is being checked. Used for error messages. + */ +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name) +{ + if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) + return -EBADMSG; + + if (ms->id_type != PKEY_ID_PKCS7) { + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + name); + return -ENOPKG; + } + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + name); + return -EBADMSG; + } + + return 0; +} diff --git a/kernel/module_signing.c b/kernel/module_signing.c index b10fb1986ca9..9d9fc678c91d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -7,37 +7,13 @@ #include <linux/kernel.h> #include <linux/errno.h> +#include <linux/module.h> +#include <linux/module_signature.h> #include <linux/string.h> #include <linux/verification.h> #include <crypto/public_key.h> #include "module-internal.h" -enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - /* * Verify the signature on a module. */ @@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; size_t sig_len, modlen = info->len; + int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info) return -EBADMSG; memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - modlen -= sizeof(ms); + + ret = mod_check_sig(&ms, modlen, info->name); + if (ret) + return ret; sig_len = be32_to_cpu(ms.sig_len); - if (sig_len >= modlen) - return -EBADMSG; - modlen -= sig_len; + modlen -= sig_len + sizeof(ms); info->len = modlen; - if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("%s: Module is not signed with expected PKCS#7 message\n", - info->name); - return -ENOPKG; - } - - if (ms.algo != 0 || - ms.hash != 0 || - ms.signer_len != 0 || - ms.key_id_len != 0 || - ms.__pad[0] != 0 || - ms.__pad[1] != 0 || - ms.__pad[2] != 0) { - pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", - info->name); - return -EBADMSG; - } - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include <linux/debug_locks.h> #include <linux/sched/debug.h> #include <linux/interrupt.h> +#include <linux/kgdb.h> #include <linux/kmsg_dump.h> #include <linux/kallsyms.h> #include <linux/notifier.h> @@ -220,6 +221,13 @@ void panic(const char *fmt, ...) #endif /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. * If we want to run this after calling panic_notifiers, pass @@ -551,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - if (args) - pr_warn(CUT_HERE); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -591,37 +596,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +#ifndef __WARN_FLAGS +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); + pr_warn(CUT_HERE); -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) -{ - struct warn_args args; + if (!fmt) { + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); +EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { diff --git a/kernel/params.c b/kernel/params.c index cf448785d058..8e56f8b12d8f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -12,6 +12,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/security.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ @@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp) { + if (kp->flags & KERNEL_PARAM_FL_HWPARAM && + security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) + return false; + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } + + return true; } static int parse_one(char *param, @@ -132,8 +139,10 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); - param_check_unsafe(¶ms[i]); - err = params[i].ops->set(val, ¶ms[i]); + if (param_check_unsafe(¶ms[i])) + err = params[i].ops->set(val, ¶ms[i]); + else + err = -EPERM; kernel_param_unlock(params[i].mod); return err; } @@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; kernel_param_lock(mk->mod); - param_check_unsafe(attribute->param); - err = attribute->param->ops->set(buf, attribute->param); + if (param_check_unsafe(attribute->param)) + err = attribute->param->ops->set(buf, attribute->param); + else + err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index cd7434e6000d..3c0a5a8170b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@ #include <linux/ctype.h> #include <linux/genhd.h> #include <linux/ktime.h> +#include <linux/security.h> #include <trace/events/power.h> #include "power.h" @@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops; bool hibernation_available(void) { - return (nohibernate == 0); + return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); } /** diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 1d21ebacfdb8..17a9591e54ff 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -11,11 +11,18 @@ int _braille_console_setup(char **str, char **brl_options) { - if (!strncmp(*str, "brl,", 4)) { + size_t len; + + len = str_has_prefix(*str, "brl,"); + if (len) { *brl_options = ""; - *str += 4; - } else if (!strncmp(*str, "brl=", 4)) { - *brl_options = *str + 4; + *str += len; + return 0; + } + + len = str_has_prefix(*str, "brl="); + if (len) { + *brl_options = *str + len; *str = strchr(*brl_options, ','); if (!*str) { pr_err("need port name after brl=\n"); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1888f6a3b694..ca65327a6de8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { + size_t len; + if (!str) return -EINVAL; - if (!strncmp(str, "on", 2)) { + len = str_has_prefix(str, "on"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; - return 2; - } else if (!strncmp(str, "off", 3)) { + return len; + } + + len = str_has_prefix(str, "off"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; - return 3; - } else if (!strncmp(str, "ratelimit", 9)) { + return len; + } + + len = str_has_prefix(str, "ratelimit"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; - return 9; + return len; } + return -EINVAL; } @@ -3274,7 +3284,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - while (l > size && seq < dumper->next_seq) { + while (l >= size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, true, time, NULL, 0); diff --git a/kernel/resource.c b/kernel/resource.c index 7ea4306503c5..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, false, &res)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; + pfn = PFN_UP(res.start); + end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) @@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head) EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_DEVICE_PRIVATE -/** - * devm_request_free_mem_region - find free region for device private memory - * - * @dev: device struct to bind the resource to - * @size: size in bytes of the device memory to add - * @base: resource tree to look in - * - * This function tries to find an empty range of physical address big enough to - * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE - * memory, which in turn allocates struct pages. - */ -struct resource *devm_request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size) +static struct resource *__request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size, const char *name) { resource_size_t end, addr; struct resource *res; @@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev, REGION_DISJOINT) continue; - res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (dev) + res = devm_request_mem_region(dev, addr, size, name); + else + res = request_mem_region(addr, size, name); if (!res) return ERR_PTR(-ENOMEM); res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; @@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev, return ERR_PTR(-ERANGE); } + +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + return __request_free_mem_region(dev, base, size, dev_name(dev)); +} EXPORT_SYMBOL_GPL(devm_request_free_mem_region); + +struct resource *request_free_mem_region(struct resource *base, + unsigned long size, const char *name) +{ + return __request_free_mem_region(NULL, base, size, name); +} +EXPORT_SYMBOL_GPL(request_free_mem_region); + #endif /* CONFIG_DEVICE_PRIVATE */ static int __init strict_iomem(char *str) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e8387bdd09c..7880f4f64d0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); @@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel @@ -3871,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev) /* * Various schedule()-time debugging checks and statistics: */ -static inline void schedule_debug(struct task_struct *prev) +static inline void schedule_debug(struct task_struct *prev, bool preempt) { #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + if (!preempt && prev->state && prev->non_block_count) { + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", + prev->comm, prev->pid, prev->non_block_count); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } +#endif + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); preempt_count_set(PREEMPT_DISABLED); @@ -3989,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev); + schedule_debug(prev, preempt); if (sched_feat(HRTICK)) hrtick_clear(rq); @@ -4033,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -4214,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void) #ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4287,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. @@ -6060,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; @@ -6421,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } @@ -6763,7 +6774,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || + !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) return; @@ -6779,8 +6790,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset) "BUG: sleeping function called from invalid context at %s:%d\n", file, line); printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), + "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, current->pid, current->comm); if (task_stack_end_corrupted(current)) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bbf68c3161..83ab35e2374f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -1603,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; @@ -4354,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4376,15 +4370,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4476,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -4994,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -5080,11 +5062,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6412,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..a39bed2c784f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,10 +30,42 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -45,17 +77,11 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,23 +96,28 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. + */ + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p->flags & PF_KTHREAD) + continue; + + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -101,22 +132,22 @@ static int membarrier_global_expedited(void) static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -150,21 +175,17 @@ static int membarrier_private_expedited(int flags) if (cpu == raw_smp_processor_id()) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); - } - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -177,32 +198,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +280,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +296,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +318,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[]; extern struct ctl_table firmware_config_table[]; #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif @@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0e315a2e77ae..4820823515e9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1678,24 +1678,26 @@ void timer_clear_idle(void) static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { + unsigned long now = READ_ONCE(jiffies); + /* * NOHZ optimization. After a long idle sleep we need to forward the * base to current jiffies. Avoid a loop by searching the bitfield for * the next expiring timer. */ - if ((long)(jiffies - base->clk) > 2) { + if ((long)(now - base->clk) > 2) { unsigned long next = __next_timer_interrupt(base); /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, jiffies)) { + if (time_after(next, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. */ - base->clk = jiffies; + base->clk = now; return 0; } base->clk = next; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e38a010003c..44bd08f2443b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -142,8 +142,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; @@ -585,6 +590,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + /* * The strncpy_from_unsafe() call will likely not fill the entire * buffer, but that's okay in this circumstance as we're probing @@ -596,6 +605,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, */ ret = strncpy_from_unsafe(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 356b848c697a..62a50bf399d6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6036,11 +6036,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; - if (ftrace_hash_empty(hash)) - return; - - entry = __ftrace_lookup_ip(hash, func->ip); - + entry = ftrace_lookup_ip(hash, func->ip); /* * Do not allow this rec to match again. * Yeah, it may waste some memory, but will be removed diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 947ba433865f..252f79c435f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1854,7 +1854,7 @@ int __init register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer *buffer = buf->buffer; @@ -4251,7 +4251,7 @@ static int tracing_open(struct inode *inode, struct file *file) if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else - tracing_reset(trace_buf, cpu); + tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -4815,15 +4815,15 @@ static const char readme_msg[] = #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS - " dynamic_events\t\t- Add/remove/show the generic dynamic events\n" + " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS - " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) @@ -4848,7 +4848,7 @@ static const char readme_msg[] = #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif - "\t +|-[u]<offset>(<fetcharg>)\n" + "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t <type>\\[<array-size>\\]\n" @@ -6742,7 +6742,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else - tracing_reset(&tr->max_buffer, iter->cpu_file); + tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 005f08629b8b..26b0a08f3c7d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,7 +677,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_buffer *buf, int cpu); void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index fa100ed3b4de..a41fed46c285 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) return -EINVAL; event++; } + argc--; argv++; p = strchr(event, '/'); if (p) { @@ -61,10 +62,13 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) for_each_dyn_event_safe(pos, n) { if (type && type != pos->ops) continue; - if (pos->ops->match(system, event, pos)) { - ret = pos->ops->free(pos); + if (!pos->ops->match(system, event, + argc, (const char **)argv, pos)) + continue; + + ret = pos->ops->free(pos); + if (ret) break; - } } mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 8c334064e4d6..46898138d2df 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -31,8 +31,9 @@ struct dyn_event; * @is_busy: Check whether given event is busy so that it can not be deleted. * Return true if it is busy, otherwides false. * @free: Delete the given event. Return 0 if success, otherwides error. - * @match: Check whether given event and system name match this event. - * Return true if it matches, otherwides false. + * @match: Check whether given event and system name match this event. The argc + * and argv is used for exact match. Return true if it matches, otherwides + * false. * * Except for @create, these methods are called under holding event_mutex. */ @@ -43,7 +44,7 @@ struct dyn_event_operations { bool (*is_busy)(struct dyn_event *ev); int (*free)(struct dyn_event *ev); bool (*match)(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); }; /* Register new dyn_event type -- must be called at first */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca6b0dff60c5..9468bd8d44a2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -13,6 +13,10 @@ #include <linux/rculist.h> #include <linux/tracefs.h> +/* for gfp flag names */ +#include <linux/trace_events.h> +#include <trace/events/mmflags.h> + #include "tracing_map.h" #include "trace.h" #include "trace_dynevent.h" @@ -374,7 +378,7 @@ static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); static bool synth_event_is_busy(struct dyn_event *ev); static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations synth_event_ops = { .create = synth_event_create, @@ -422,7 +426,7 @@ static bool synth_event_is_busy(struct dyn_event *ev) } static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct synth_event *sev = to_synth_event(ev); @@ -752,6 +756,8 @@ static int synth_field_size(char *type) size = sizeof(unsigned long); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); @@ -792,6 +798,8 @@ static const char *synth_field_fmt(char *type) fmt = "%lu"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%s"; @@ -834,9 +842,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + trace_seq_printf(s, print_fmt, se->fields[i]->name, entry->fields[n_u64], i == se->n_fields - 1 ? "" : " "); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } n_u64++; } } @@ -2785,6 +2804,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return NULL; } + alias->var_ref_idx = var_ref->var_ref_idx; + return alias; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9d483ad9bb6c..324ffbea3556 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@ #include <linux/uaccess.h> #include <linux/rculist.h> #include <linux/error-injection.h> +#include <linux/security.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ @@ -39,7 +40,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_kprobe_release(struct dyn_event *ev); static bool trace_kprobe_is_busy(struct dyn_event *ev); static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_kprobe_ops = { .create = trace_kprobe_create, @@ -137,13 +138,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tk->tp); } +static bool trace_kprobe_match_command_head(struct trace_kprobe *tk, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + if (!tk->symbol) + snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + snprintf(buf, sizeof(buf), "%s+%u", + trace_kprobe_symbol(tk), tk->rp.kp.offset); + else + snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tk->tp, argc, argv); +} + static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); return strcmp(trace_probe_name(&tk->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && + trace_kprobe_match_command_head(tk, argc, argv); } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -180,20 +204,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk) return addr; } +static nokprobe_inline struct trace_kprobe * +trace_kprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_kprobe, tp); +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return kprobe_on_func_entry(tk->rp.kp.addr, + return tk ? kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, - tk->rp.kp.addr ? 0 : tk->rp.kp.offset); + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false; } bool trace_kprobe_error_injectable(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return within_error_injection_list(trace_kprobe_address(tk)); + return tk ? within_error_injection_list(trace_kprobe_address(tk)) : + false; } static int register_kprobe_event(struct trace_kprobe *tk); @@ -291,32 +328,68 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) return ret; } +static void __disable_trace_kprobe(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_kprobe *tk; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (!trace_kprobe_is_registered(tk)) + continue; + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); + else + disable_kprobe(&tk->rp.kp); + } +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ -static int -enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int enable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - bool enabled = trace_probe_is_enabled(&tk->tp); + struct trace_probe *pos, *tp; + struct trace_kprobe *tk; + bool enabled; int ret = 0; + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ if (file) { - ret = trace_probe_add_file(&tk->tp, file); + ret = trace_probe_add_file(tp, file); if (ret) return ret; } else - trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); if (enabled) return 0; - ret = __enable_trace_kprobe(tk); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (trace_kprobe_has_gone(tk)) + continue; + ret = __enable_trace_kprobe(tk); + if (ret) + break; + enabled = true; + } + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + __disable_trace_kprobe(tp); if (file) - trace_probe_remove_file(&tk->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); } return ret; @@ -326,11 +399,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ -static int -disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int disable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - struct trace_probe *tp = &tk->tp; - int ret = 0; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; if (file) { if (!trace_probe_get_file_link(tp, file)) @@ -341,12 +417,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) } else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { - if (trace_kprobe_is_return(tk)) - disable_kretprobe(&tk->rp); - else - disable_kprobe(&tk->rp.kp); - } + if (!trace_probe_is_enabled(tp)) + __disable_trace_kprobe(tp); out: if (file) @@ -358,7 +430,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) */ trace_probe_remove_file(tp, file); - return ret; + return 0; } #if defined(CONFIG_KPROBES_ON_FTRACE) && \ @@ -389,6 +461,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + if (trace_kprobe_is_registered(tk)) return -EINVAL; @@ -437,6 +513,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) /* Unregister a trace_probe and probe_event */ static int unregister_trace_kprobe(struct trace_kprobe *tk) { + /* If other probes are on the event, just unregister kprobe */ + if (trace_probe_has_sibling(&tk->tp)) + goto unreg; + /* Enabled event can not be unregistered */ if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; @@ -445,12 +525,82 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (unregister_kprobe_event(tk)) return -EBUSY; +unreg: __unregister_trace_kprobe(tk); dyn_event_remove(&tk->devent); + trace_probe_unlink(&tk->tp); return 0; } +static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, + struct trace_kprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_kprobe, tp); + if (strcmp(trace_kprobe_symbol(orig), + trace_kprobe_symbol(comp)) || + trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. + */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) +{ + int ret; + + ret = trace_probe_compare_arg_type(&tk->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_kprobe_has_same_kprobe(to, tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tk->tp, &to->tp); + if (ret) + return ret; + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + + if (ret) + trace_probe_unlink(&tk->tp); + else + dyn_event_add(&tk->devent); + + return ret; +} + /* Register a trace_probe and probe_event */ static int register_trace_kprobe(struct trace_kprobe *tk) { @@ -459,14 +609,17 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&event_mutex); - /* Delete old (same name) event if exist */ old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); if (old_tk) { - ret = unregister_trace_kprobe(old_tk); - if (ret < 0) - goto end; - free_trace_kprobe(old_tk); + if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = append_trace_kprobe(tk, old_tk); + } + goto end; } /* Register new event */ @@ -700,7 +853,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_INSN_BNDRY); else if (ret == -ENOENT) trace_probe_log_err(0, BAD_PROBE_ADDR); - else if (ret != -ENOMEM) + else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); goto error; } @@ -965,6 +1118,9 @@ retry: case FETCH_OP_COMM: val = (unsigned long)current->comm; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); @@ -1089,7 +1245,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1116,7 +1275,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kretprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1145,23 +1307,31 @@ static int kprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kretprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } #ifdef CONFIG_PERF_EVENTS @@ -1289,20 +1459,19 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, static int kprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_kprobe(tk, file); + return enable_trace_kprobe(event, file); case TRACE_REG_UNREGISTER: - return disable_trace_kprobe(tk, file); + return disable_trace_kprobe(event, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_kprobe(tk, NULL); + return enable_trace_kprobe(event, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_kprobe(tk, NULL); + return disable_trace_kprobe(event, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1369,7 +1538,6 @@ static inline void init_trace_event_call(struct trace_kprobe *tk) call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; - call->data = tk; } static int register_kprobe_event(struct trace_kprobe *tk) @@ -1432,7 +1600,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) { struct trace_kprobe *tk; - tk = container_of(event_call, struct trace_kprobe, tp.call); + tk = trace_kprobe_primary_from_call(event_call); + if (unlikely(!tk)) + return; if (trace_probe_is_enabled(&tk->tp)) { WARN_ON(1); @@ -1577,7 +1747,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1598,7 +1769,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1631,7 +1803,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); @@ -1649,7 +1822,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cab4a5398f1d..d54ce252b05a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, { int i; const char *ret = trace_seq_buffer_ptr(p); + const char *fmt = concatenate ? "%*phN" : "%*ph"; - for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", - buf[i]); + for (i = 0; i < buf_len; i += 16) + trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]); trace_seq_putc(p, 0); return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb6bfbc5bf86..baf58a3612c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -316,6 +316,29 @@ inval_var: return -EINVAL; } +static int str_to_immediate(char *str, unsigned long *imm) +{ + if (isdigit(str[0])) + return kstrtoul(str, 0, imm); + else if (str[0] == '-') + return kstrtol(str, 0, (long *)imm); + else if (str[0] == '+') + return kstrtol(str + 1, 0, (long *)imm); + return -EINVAL; +} + +static int __parse_imm_string(char *str, char **pbuf, int offs) +{ + size_t len = strlen(str); + + if (str[len - 1] != '"') { + trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); + return -EINVAL; + } + *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + return 0; +} + /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, @@ -430,7 +453,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) { + if (code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) { trace_probe_log_err(offs, COMM_CANT_DEREF); return -EINVAL; } @@ -444,6 +468,21 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->offset = offset; } break; + case '\\': /* Immediate value */ + if (arg[1] == '"') { /* Immediate string */ + ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + if (ret) + break; + code->op = FETCH_OP_DATA; + code->data = tmp; + } else { + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(offs + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + } + break; } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ @@ -542,8 +581,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } - /* Since $comm can not be dereferred, we can find $comm by strcmp */ - if (strcmp(arg, "$comm") == 0) { + /* + * Since $comm and immediate string can not be dereferred, + * we can find those by strcmp. + */ + if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) return -EINVAL; @@ -580,7 +622,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && - code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); ret = -EINVAL; @@ -589,9 +632,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || parg->count) { /* - * IMM and COMM is pointing actual address, those must - * be kept, and if parg->count != 0, this is an array - * of string pointers instead of string address itself. + * IMM, DATA and COMM is pointing actual address, those + * must be kept, and if parg->count != 0, this is an + * array of string pointers instead of string address + * itself. */ code++; if (code->op != FETCH_OP_NOP) { @@ -665,7 +709,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, fail: if (ret) { for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); } kfree(tmp); @@ -736,7 +781,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) struct fetch_insn *code = arg->code; while (code && code->op != FETCH_OP_END) { - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); code++; } @@ -886,44 +932,85 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, return 0; } +static void trace_probe_event_free(struct trace_probe_event *tpe) +{ + kfree(tpe->class.system); + kfree(tpe->call.name); + kfree(tpe->call.print_fmt); + kfree(tpe); +} + +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to) +{ + if (trace_probe_has_sibling(tp)) + return -EBUSY; + + list_del_init(&tp->list); + trace_probe_event_free(tp->event); + + tp->event = to->event; + list_add_tail(&tp->list, trace_probe_probe_list(to)); + + return 0; +} + +void trace_probe_unlink(struct trace_probe *tp) +{ + list_del_init(&tp->list); + if (list_empty(trace_probe_probe_list(tp))) + trace_probe_event_free(tp->event); + tp->event = NULL; +} void trace_probe_cleanup(struct trace_probe *tp) { - struct trace_event_call *call = trace_probe_event_call(tp); int i; for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - if (call->class) - kfree(call->class->system); - kfree(call->name); - kfree(call->print_fmt); + if (tp->event) + trace_probe_unlink(tp); } int trace_probe_init(struct trace_probe *tp, const char *event, const char *group) { - struct trace_event_call *call = trace_probe_event_call(tp); + struct trace_event_call *call; + int ret = 0; if (!event || !group) return -EINVAL; - call->class = &tp->class; - call->name = kstrdup(event, GFP_KERNEL); - if (!call->name) + tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + if (!tp->event) return -ENOMEM; - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) { - kfree(call->name); - call->name = NULL; - return -ENOMEM; + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->event->probes, &tp->list); + + call = trace_probe_event_call(tp); + call->class = &tp->event->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) { + ret = -ENOMEM; + goto error; + } + + tp->event->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->event->class.system) { + ret = -ENOMEM; + goto error; } - INIT_LIST_HEAD(&tp->files); - INIT_LIST_HEAD(&tp->class.fields); return 0; + +error: + trace_probe_cleanup(tp); + return ret; } int trace_probe_register_event_call(struct trace_probe *tp) @@ -952,7 +1039,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) link->file = file; INIT_LIST_HEAD(&link->list); - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tp->event->files); trace_probe_set_flag(tp, TP_FLAG_TRACE); return 0; } @@ -983,8 +1070,45 @@ int trace_probe_remove_file(struct trace_probe *tp, synchronize_rcu(); kfree(link); - if (list_empty(&tp->files)) + if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); return 0; } + +/* + * Return the smallest index of different type argument (start from 1). + * If all argument types and name are same, return 0. + */ +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) +{ + int i; + + for (i = 0; i < a->nr_args; i++) { + if ((b->nr_args <= i) || + ((a->args[i].type != b->args[i].type) || + (a->args[i].count != b->args[i].count) || + strcmp(a->args[i].name, b->args[i].name))) + return i + 1; + } + + return 0; +} + +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int i; + + if (tp->nr_args < argc) + return false; + + for (i = 0; i < argc; i++) { + snprintf(buf, sizeof(buf), "%s=%s", + tp->args[i].name, tp->args[i].comm); + if (strcmp(buf, argv[i])) + return false; + } + return true; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index d1714820efe1..4ee703728aec 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -89,6 +89,7 @@ enum fetch_op { FETCH_OP_COMM, /* Current comm */ FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ + FETCH_OP_DATA, /* Allocated data: .data */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ FETCH_OP_UDEREF, /* User-space Dereference: .offset */ @@ -222,11 +223,18 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; -struct trace_probe { +/* Event call and class holder */ +struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ struct trace_event_class class; struct trace_event_call call; struct list_head files; + struct list_head probes; +}; + +struct trace_probe { + struct list_head list; + struct trace_probe_event *event; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -240,19 +248,19 @@ struct event_file_link { static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->flags & flag); + return !!(tp->event->flags & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, unsigned int flag) { - tp->flags |= flag; + tp->event->flags |= flag; } static inline void trace_probe_clear_flag(struct trace_probe *tp, unsigned int flag) { - tp->flags &= ~flag; + tp->event->flags &= ~flag; } static inline bool trace_probe_is_enabled(struct trace_probe *tp) @@ -262,45 +270,76 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp) static inline const char *trace_probe_name(struct trace_probe *tp) { - return trace_event_name(&tp->call); + return trace_event_name(&tp->event->call); } static inline const char *trace_probe_group_name(struct trace_probe *tp) { - return tp->call.class->system; + return tp->event->call.class->system; } static inline struct trace_event_call * trace_probe_event_call(struct trace_probe *tp) { - return &tp->call; + return &tp->event->call; +} + +static inline struct trace_probe_event * +trace_probe_event_from_call(struct trace_event_call *event_call) +{ + return container_of(event_call, struct trace_probe_event, call); +} + +static inline struct trace_probe * +trace_probe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe_event *tpe = trace_probe_event_from_call(call); + + return list_first_entry(&tpe->probes, struct trace_probe, list); +} + +static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) +{ + return &tp->event->probes; +} + +static inline bool trace_probe_has_sibling(struct trace_probe *tp) +{ + struct list_head *list = trace_probe_probe_list(tp); + + return !list_empty(list) && !list_is_singular(list); } static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ - return trace_remove_event_call(&tp->call); + return trace_remove_event_call(&tp->event->call); } static inline bool trace_probe_has_single_file(struct trace_probe *tp) { - return !!list_is_singular(&tp->files); + return !!list_is_singular(&tp->event->files); } int trace_probe_init(struct trace_probe *tp, const char *event, const char *group); void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); +void trace_probe_unlink(struct trace_probe *tp); int trace_probe_register_event_call(struct trace_probe *tp); int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); int trace_probe_remove_file(struct trace_probe *tp, struct trace_event_file *file); struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv); #define trace_probe_for_each_link(pos, tp) \ - list_for_each_entry(pos, &(tp)->files, list) + list_for_each_entry(pos, &(tp)->event->files, list) #define trace_probe_for_each_link_rcu(pos, tp) \ - list_for_each_entry_rcu(pos, &(tp)->files, list) + list_for_each_entry_rcu(pos, &(tp)->event->files, list) /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) @@ -370,6 +409,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_VAR, "Invalid $-valiable specified"), \ C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(BAD_IMM, "Invalid immediate value"), \ + C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ C(BAD_FILE_OFFS, "Invalid file offset value"), \ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ @@ -393,7 +434,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(ARG_TOO_LONG, "Argument expression is too long"), \ C(NO_ARG_BODY, "No argument expression"), \ C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ - C(FAIL_REG_PROBE, "Failed to register probe event"), + C(FAIL_REG_PROBE, "Failed to register probe event"),\ + C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ + C(SAME_PROBE, "There is already the exact same probe event"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5d16f73898db..ec9a34a97129 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -53,6 +53,104 @@ static void print_max_stack(void) } } +/* + * The stack tracer looks for a maximum stack at each call from a function. It + * registers a callback from ftrace, and in that callback it examines the stack + * size. It determines the stack size from the variable passed in, which is the + * address of a local variable in the stack_trace_call() callback function. + * The stack size is calculated by the address of the local variable to the top + * of the current stack. If that size is smaller than the currently saved max + * stack size, nothing more is done. + * + * If the size of the stack is greater than the maximum recorded size, then the + * following algorithm takes place. + * + * For architectures (like x86) that store the function's return address before + * saving the function's local variables, the stack will look something like + * this: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: return addr to entry code + * 11: start of sys_foo frame + * 20: return addr to sys_foo + * 21: start of kernel_func_bar frame + * 30: return addr to kernel_func_bar + * 31: [ do trace stack here ] + * + * The save_stack_trace() is called returning all the functions it finds in the + * current stack. Which would be (from the bottom of the stack to the top): + * + * return addr to kernel_func_bar + * return addr to sys_foo + * return addr to entry code + * + * Now to figure out how much each of these functions' local variable size is, + * a search of the stack is made to find these values. When a match is made, it + * is added to the stack_dump_trace[] array. The offset into the stack is saved + * in the stack_trace_index[] array. The above example would show: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 20 + * return addr to entry | 10 + * + * The print_max_stack() function above, uses these values to print the size of + * each function's portion of the stack. + * + * for (i = 0; i < nr_entries; i++) { + * size = i == nr_entries - 1 ? stack_trace_index[i] : + * stack_trace_index[i] - stack_trace_index[i+1] + * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]); + * } + * + * The above shows + * + * depth size location + * ----- ---- -------- + * 0 30 10 kernel_func_bar + * 1 20 10 sys_foo + * 2 10 10 entry code + * + * Now for architectures that might save the return address after the functions + * local variables (saving the link register before calling nested functions), + * this will cause the stack to look a little different: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: start of sys_foo_frame + * 19: return addr to entry code << lr saved before calling kernel_func_bar + * 20: start of kernel_func_bar frame + * 29: return addr to sys_foo_frame << lr saved before calling next function + * 30: [ do trace stack here ] + * + * Although the functions returned by save_stack_trace() may be the same, the + * placement in the stack will be different. Using the same algorithm as above + * would yield: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 29 + * return addr to entry | 19 + * + * Where the mapping is off by one: + * + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! + * + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the + * values in stack_trace_index[] are shifted by one to and the number of + * stack trace entries is decremented by one. + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 29 + * return addr to sys_foo | 19 + * + * Although the entry function is not displayed, the first function (sys_foo) + * will still include the stack size of it. + */ static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; @@ -158,6 +256,20 @@ static void check_stack(unsigned long ip, unsigned long *stack) i++; } +#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER + /* + * Some archs will store the link register before calling + * nested functions. This means the saved return address + * comes after the local storage, and we need to shift + * for that. + */ + if (x > 1) { + memmove(&stack_trace_index[0], &stack_trace_index[1], + sizeof(stack_trace_index[0]) * (x - 1)); + x--; + } +#endif + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1ceedb9146b1..dd884341f5c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -44,7 +44,7 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); static bool trace_uprobe_is_busy(struct dyn_event *ev); static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_uprobe_ops = { .create = trace_uprobe_create, @@ -248,6 +248,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_COMM: val = FETCH_TOKEN_COMM; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; @@ -284,13 +287,54 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tu->tp); } +static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int len; + + if (!argc) + return true; + + len = strlen(tu->filename); + if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') + return false; + + if (tu->ref_ctr_offset == 0) + snprintf(buf, sizeof(buf), "0x%0*lx", + (int)(sizeof(void *) * 2), tu->offset); + else + snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", + (int)(sizeof(void *) * 2), tu->offset, + tu->ref_ctr_offset); + if (strcmp(buf, &argv[0][len + 1])) + return false; + + argc--; argv++; + + return trace_probe_match_command_args(&tu->tp, argc, argv); +} + static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return strcmp(trace_probe_name(&tu->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && + trace_uprobe_match_command_head(tu, argc, argv); +} + +static nokprobe_inline struct trace_uprobe * +trace_uprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_uprobe, tp); } /* @@ -352,15 +396,76 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; + if (trace_probe_has_sibling(&tu->tp)) + goto unreg; + ret = unregister_uprobe_event(tu); if (ret) return ret; +unreg: dyn_event_remove(&tu->devent); + trace_probe_unlink(&tu->tp); free_trace_uprobe(tu); return 0; } +static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, + struct trace_uprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + struct inode *comp_inode = d_real_inode(comp->path.dentry); + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_uprobe, tp); + if (comp_inode != d_real_inode(orig->path.dentry) || + comp->offset != orig->offset) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. + */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) +{ + int ret; + + ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_uprobe_has_same_uprobe(to, tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tu->tp, &to->tp); + if (!ret) + dyn_event_add(&tu->devent); + + return ret; +} + /* * Uprobe with multiple reference counter is not allowed. i.e. * If inode and offset matches, reference counter offset *must* @@ -370,25 +475,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) * as the new one does not conflict with any other existing * ones. */ -static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +static int validate_ref_ctr_offset(struct trace_uprobe *new) { struct dyn_event *pos; - struct trace_uprobe *tmp, *old = NULL; + struct trace_uprobe *tmp; struct inode *new_inode = d_real_inode(new->path.dentry); - old = find_probe_event(trace_probe_name(&new->tp), - trace_probe_group_name(&new->tp)); - for_each_trace_uprobe(tmp, pos) { - if ((old ? old != tmp : true) && - new_inode == d_real_inode(tmp->path.dentry) && + if (new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && new->ref_ctr_offset != tmp->ref_ctr_offset) { pr_warn("Reference counter offset mismatch."); - return ERR_PTR(-EINVAL); + return -EINVAL; } } - return old; + return 0; } /* Register a trace_uprobe and probe_event */ @@ -399,18 +500,22 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&event_mutex); - /* register as an event */ - old_tu = find_old_trace_uprobe(tu); - if (IS_ERR(old_tu)) { - ret = PTR_ERR(old_tu); + ret = validate_ref_ctr_offset(tu); + if (ret) goto end; - } + /* register as an event */ + old_tu = find_probe_event(trace_probe_name(&tu->tp), + trace_probe_group_name(&tu->tp)); if (old_tu) { - /* delete old event */ - ret = unregister_trace_uprobe(old_tu); - if (ret) - goto end; + if (is_ret_probe(tu) != is_ret_probe(old_tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = append_trace_uprobe(tu, old_tu); + } + goto end; } ret = register_uprobe_event(tu); @@ -897,7 +1002,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e u8 *data; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, tp.call.event); + tu = trace_uprobe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (unlikely(!tu)) + goto out; if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", @@ -924,27 +1032,71 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); -static int -probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, - filter_func_t filter) +static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - bool enabled = trace_probe_is_enabled(&tu->tp); int ret; + tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); + + if (tu->ref_ctr_offset) + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + else + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + + if (ret) + tu->inode = NULL; + + return ret; +} + +static void __probe_event_disable(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_uprobe *tu; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + if (!tu->inode) + continue; + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; + } +} + +static int probe_event_enable(struct trace_event_call *call, + struct trace_event_file *file, filter_func_t filter) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + bool enabled; + int ret; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This may also change "enabled" state */ if (file) { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (trace_probe_test_flag(tp, TP_FLAG_PROFILE)) return -EINTR; - ret = trace_probe_add_file(&tu->tp, file); + ret = trace_probe_add_file(tp, file); if (ret < 0) return ret; } else { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + if (trace_probe_test_flag(tp, TP_FLAG_TRACE)) return -EINTR; - trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); } + tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(&tu->filter)); if (enabled) @@ -954,18 +1106,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, if (ret) goto err_flags; - tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) { - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - } else { - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = trace_uprobe_enable(tu, filter); + if (ret) { + __probe_event_disable(tp); + goto err_buffer; + } } - if (ret) - goto err_buffer; - return 0; err_buffer: @@ -973,33 +1122,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, err_flags: if (file) - trace_probe_remove_file(&tu->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); return ret; } -static void -probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) +static void probe_event_disable(struct trace_event_call *call, + struct trace_event_file *file) { - if (!trace_probe_is_enabled(&tu->tp)) + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return; + + if (!trace_probe_is_enabled(tp)) return; if (file) { - if (trace_probe_remove_file(&tu->tp, file) < 0) + if (trace_probe_remove_file(tp, file) < 0) return; - if (trace_probe_is_enabled(&tu->tp)) + if (trace_probe_is_enabled(tp)) return; } else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); - - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + __probe_event_disable(tp); uprobe_buffer_disable(); } @@ -1007,7 +1158,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = event_call->data; + struct trace_uprobe *tu; + + tu = trace_uprobe_primary_from_call(event_call); + if (unlikely(!tu)) + return -ENODEV; if (is_ret_probe(tu)) { DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); @@ -1100,6 +1255,27 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) return err; } +static int uprobe_perf_multi_call(struct trace_event_call *call, + struct perf_event *event, + int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = op(tu, event); + if (ret) + break; + } + + return ret; +} static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { @@ -1213,30 +1389,29 @@ static int trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, file, NULL); + return probe_event_enable(event, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, file); + probe_event_disable(event, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, NULL, uprobe_perf_filter); + return probe_event_enable(event, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, NULL); + probe_event_disable(event, NULL); return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_open(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_open); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_close(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_close); #endif default: @@ -1330,7 +1505,6 @@ static inline void init_trace_event_call(struct trace_uprobe *tu) call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; - call->data = tu; } static int register_uprobe_event(struct trace_uprobe *tu) @@ -1399,7 +1573,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) { struct trace_uprobe *tu; - tu = container_of(event_call, struct trace_uprobe, tp.call); + tu = trace_uprobe_primary_from_call(event_call); free_trace_uprobe(tu); } |