Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/bpf/inode.c | 92
-rw-r--r--  kernel/cpu.c | 11
-rw-r--r--  kernel/debug/debug_core.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/dma/Kconfig | 12
-rw-r--r--  kernel/dma/coherent.c | 13
-rw-r--r--  kernel/dma/mapping.c | 113
-rw-r--r--  kernel/dma/remap.c | 51
-rw-r--r--  kernel/dma/swiotlb.c | 3
-rw-r--r--  kernel/elfcore.c | 1
-rw-r--r--  kernel/events/core.c | 13
-rw-r--r--  kernel/events/uprobes.c | 81
-rw-r--r--  kernel/exit.c | 74
-rw-r--r--  kernel/extable.c | 11
-rw-r--r--  kernel/fork.c | 25
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/kexec_core.c | 2
-rw-r--r--  kernel/kexec_file.c | 68
-rw-r--r--  kernel/kprobes.c | 56
-rw-r--r--  kernel/livepatch/core.c | 1
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 2
-rw-r--r--  kernel/module.c | 114
-rw-r--r--  kernel/module_signature.c | 46
-rw-r--r--  kernel/module_signing.c | 56
-rw-r--r--  kernel/panic.c | 42
-rw-r--r--  kernel/params.c | 21
-rw-r--r--  kernel/power/hibernate.c | 3
-rw-r--r--  kernel/printk/braille.c | 15
-rw-r--r--  kernel/printk/printk.c | 24
-rw-r--r--  kernel/resource.c | 49
-rw-r--r--  kernel/sched/core.c | 47
-rw-r--r--  kernel/sched/fair.c | 39
-rw-r--r--  kernel/sched/idle.c | 1
-rw-r--r--  kernel/sched/membarrier.c | 239
-rw-r--r--  kernel/sched/sched.h | 34
-rw-r--r--  kernel/sysctl.c | 6
-rw-r--r--  kernel/time/timer.c | 8
-rw-r--r--  kernel/trace/bpf_trace.c | 10
-rw-r--r--  kernel/trace/fgraph.c | 4
-rw-r--r--  kernel/trace/ftrace.c | 6
-rw-r--r--  kernel/trace/trace.c | 14
-rw-r--r--  kernel/trace/trace.h | 1
-rw-r--r--  kernel/trace/trace_dynevent.c | 10
-rw-r--r--  kernel/trace/trace_dynevent.h | 7
-rw-r--r--  kernel/trace/trace_events_hist.c | 25
-rw-r--r--  kernel/trace/trace_kprobe.c | 274
-rw-r--r--  kernel/trace/trace_output.c | 6
-rw-r--r--  kernel/trace/trace_probe.c | 178
-rw-r--r--  kernel/trace/trace_probe.h | 68
-rw-r--r--  kernel/trace/trace_stack.c | 112
-rw-r--r--  kernel/trace/trace_uprobe.c | 300
53 files changed, 1738 insertions, 701 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 48c5376d290a..daad787fb795 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,6 +58,7 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
@@ -127,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
+ cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@
$(obj)/kheaders_data.tar.xz: FORCE
$(call cmd,genikh)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cc0d0cf114e3..a70f7209cda3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -14,8 +14,9 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/kdev_t.h>
-#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = {
enum {
OPT_MODE,
- OPT_ERR,
};
-static const match_table_t bpf_mount_tokens = {
- { OPT_MODE, "mode=%o" },
- { OPT_ERR, NULL },
+static const struct fs_parameter_spec bpf_param_specs[] = {
+ fsparam_u32oct ("mode", OPT_MODE),
+ {}
+};
+
+static const struct fs_parameter_description bpf_fs_parameters = {
+ .name = "bpf",
+ .specs = bpf_param_specs,
};
struct bpf_mount_opts {
umode_t mode;
};
-static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
+static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- substring_t args[MAX_OPT_ARGS];
- int option, token;
- char *ptr;
+ struct bpf_mount_opts *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts->mode = S_IRWXUGO;
-
- while ((ptr = strsep(&data, ",")) != NULL) {
- if (!*ptr)
- continue;
-
- token = match_token(ptr, bpf_mount_tokens, args);
- switch (token) {
- case OPT_MODE:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
+ opt = fs_parse(fc, &bpf_fs_parameters, param, &result);
+ if (opt < 0)
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
* better continue to ignore non-existing options for bpf.
*/
- }
+ return opt == -ENOPARAM ? 0 : opt;
+
+ switch (opt) {
+ case OPT_MODE:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
}
return 0;
}
-static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts opts;
+ struct bpf_mount_opts *opts = fc->fs_private;
struct inode *inode;
int ret;
- ret = bpf_parse_options(data, &opts);
- if (ret)
- return ret;
-
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
inode = sb->s_root->d_inode;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= S_ISVTX | opts.mode;
+ inode->i_mode |= S_ISVTX | opts->mode;
return 0;
}
-static struct dentry *bpf_mount(struct file_system_type *type, int flags,
- const char *dev_name, void *data)
+static int bpf_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, bpf_fill_super);
+}
+
+static void bpf_free_fc(struct fs_context *fc)
{
- return mount_nodev(type, flags, data, bpf_fill_super);
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations bpf_context_ops = {
+ .free = bpf_free_fc,
+ .parse_param = bpf_parse_param,
+ .get_tree = bpf_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int bpf_init_fs_context(struct fs_context *fc)
+{
+ struct bpf_mount_opts *opts;
+
+ opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ opts->mode = S_IRWXUGO;
+
+ fc->fs_private = opts;
+ fc->ops = &bpf_context_ops;
+ return 0;
}
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
- .mount = bpf_mount,
+ .init_fs_context = bpf_init_fs_context,
+ .parameters = &bpf_fs_parameters,
.kill_sb = kill_litter_super,
};
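
The bpf/inode.c hunks above convert bpffs from the legacy mount_nodev()/match_token() path to the fs_context API: options are declared as fs_parameter_spec entries, parsed one at a time in .parse_param, and the private bpf_mount_opts is now allocated in .init_fs_context. Nothing changes for userspace; "mode" is still accepted and parsed as octal. A minimal userspace sketch (hypothetical mount point, error handling trimmed) using the classic mount(2) call:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* "mode" is parsed as octal by fsparam_u32oct() in the new code,
             * just as match_octal() handled it before the conversion. */
            if (mount("bpffs", "/sys/fs/bpf", "bpf", 0, "mode=0700")) {
                    perror("mount bpffs");
                    return 1;
            }
            return 0;
    }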
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1967e9eddc2..fc28e17940e0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
void __init cpu_smt_disable(bool force)
{
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ if (!cpu_smt_possible())
return;
if (force) {
@@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
*/
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
+
+/* Returns true if SMT is not supported or forcefully (irreversibly) disabled */
+bool cpu_smt_possible(void)
+{
+ return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
+ cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
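
cpu_smt_possible() gives GPL modules a single predicate for "SMT is off for good in this boot", replacing the open-coded check that cpu_smt_disable() used to carry. A short sketch of a hypothetical caller that only proceeds when SMT can never be (re)enabled:

    #include <linux/cpu.h>
    #include <linux/errno.h>

    /* Hypothetical policy hook: refuse when SMT is, or could become, active. */
    static int example_require_no_smt(void)
    {
            if (cpu_smt_possible())
                    return -EPERM;
            return 0;
    }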
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..f76d6f77dd5e 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -787,11 +787,8 @@ out:
}
/*
- * GDB places a breakpoint at this function to know dynamically
- * loaded objects. It's not defined static so that only one instance with this
- * name exists in the kernel.
+ * GDB places a breakpoint at this function to know dynamically loaded objects.
*/
-
static int module_event(struct notifier_block *self, unsigned long val,
void *data)
{
@@ -896,30 +893,25 @@ static struct sysrq_key_op sysrq_dbg_op = {
};
#endif
-static int kgdb_panic_event(struct notifier_block *self,
- unsigned long val,
- void *data)
+void kgdb_panic(const char *msg)
{
+ if (!kgdb_io_module_registered)
+ return;
+
/*
- * Avoid entering the debugger if we were triggered due to a panic
- * We don't want to get stuck waiting for input from user in such case.
- * panic_timeout indicates the system should automatically
+ * We don't want to get stuck waiting for input from user if
+ * "panic_timeout" indicates the system should automatically
* reboot on panic.
*/
if (panic_timeout)
- return NOTIFY_DONE;
+ return;
if (dbg_kdb_mode)
- kdb_printf("PANIC: %s\n", (char *)data);
+ kdb_printf("PANIC: %s\n", msg);
+
kgdb_breakpoint();
- return NOTIFY_DONE;
}
-static struct notifier_block kgdb_panic_event_nb = {
- .notifier_call = kgdb_panic_event,
- .priority = INT_MAX,
-};
-
void __weak kgdb_arch_late(void)
{
}
@@ -968,8 +960,6 @@ static void kgdb_register_callbacks(void)
kgdb_arch_late();
register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
- atomic_notifier_chain_register(&panic_notifier_list,
- &kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
register_sysrq_key('g', &sysrq_dbg_op);
#endif
@@ -983,16 +973,14 @@ static void kgdb_register_callbacks(void)
static void kgdb_unregister_callbacks(void)
{
/*
- * When this routine is called KGDB should unregister from the
- * panic handler and clean up, making sure it is not handling any
+ * When this routine is called KGDB should unregister from
+ * handlers and clean up, making sure it is not handling any
* break exceptions at the time.
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
unregister_module_notifier(&dbg_module_load_nb);
- atomic_notifier_chain_unregister(&panic_notifier_list,
- &kgdb_panic_event_nb);
kgdb_arch_exit();
#ifdef CONFIG_MAGIC_SYSRQ
unregister_sysrq_key('g', &sysrq_dbg_op);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 9ecfa37c7fbf..4567fe998c30 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -830,7 +830,7 @@ static void parse_grep(const char *str)
cp++;
while (isspace(*cp))
cp++;
- if (strncmp(cp, "grep ", 5)) {
+ if (!str_has_prefix(cp, "grep ")) {
kdb_printf("invalid 'pipe', see grephelp\n");
return;
}
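
This kdb change, like the later printk/braille.c and printk/printk.c hunks, replaces strncmp()-plus-magic-length checks with str_has_prefix(), which returns the prefix length on a match and 0 otherwise; strncmp() has the opposite sense, returning 0 on a match. A small sketch contrasting the two idioms:

    #include <linux/string.h>
    #include <linux/types.h>

    /* Old style: hard-coded length, and 0 from strncmp() means "matched". */
    static bool is_grep_old(const char *cp)
    {
            return strncmp(cp, "grep ", 5) == 0;
    }

    /* New style: no magic constant; a non-zero return means "matched" and
     * doubles as the number of characters to skip past the prefix. */
    static bool is_grep_new(const char *cp)
    {
            return str_has_prefix(cp, "grep ") != 0;
    }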
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 9decbba255fc..73c5c2b8e824 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H
config ARCH_HAS_DMA_SET_MASK
bool
+#
+# Select this option if the architecture needs special handling for
+# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
+# people think of when saying write combine, so very few platforms should
+# need to enable this.
+#
+config ARCH_HAS_DMA_WRITE_COMBINE
+ bool
+
config DMA_DECLARE_COHERENT
bool
@@ -45,9 +54,6 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_DMA_COHERENT_TO_PFN
bool
-config ARCH_HAS_DMA_MMAP_PGPROT
- bool
-
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 29fd6590dc1e..545e3869b0e3 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -122,18 +122,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_release_coherent_memory(mem);
return ret;
}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dma_release_coherent_memory(mem);
- dev->dma_mem = NULL;
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
@@ -288,7 +276,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
-EXPORT_SYMBOL(dma_mmap_from_dev_coherent);
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b0038ca3aa92..d9334f31a5af 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -136,17 +136,29 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
return ret;
}
+/*
+ * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
+ * that the intention is to allow exporting memory allocated via the
+ * coherent DMA APIs through the dma_buf API, which only accepts a
+ * scattertable. This presents a couple of problems:
+ * 1. Not all memory allocated via the coherent DMA APIs is backed by
+ * a struct page
+ * 2. Passing coherent DMA memory into the streaming APIs is not allowed
+ * as we will try to flush the memory through a different alias to that
+ * actually being used (and the flushes are redundant.)
+ */
int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->get_sgtable)
- return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
- return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
+ if (dma_is_direct(ops))
+ return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
+ if (!ops->get_sgtable)
+ return -ENXIO;
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);
@@ -161,9 +173,11 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
(IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
(attrs & DMA_ATTR_NON_CONSISTENT)))
return prot;
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT))
- return arch_dma_mmap_pgprot(dev, prot, attrs);
- return pgprot_noncached(prot);
+#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
+ if (attrs & DMA_ATTR_WRITE_COMBINE)
+ return pgprot_writecombine(prot);
+#endif
+ return pgprot_dmacoherent(prot);
}
#endif /* CONFIG_MMU */
@@ -174,7 +188,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
-#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP
+#ifdef CONFIG_MMU
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
@@ -205,8 +219,29 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
-#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
+#endif /* CONFIG_MMU */
+}
+
+/**
+ * dma_can_mmap - check if a given device supports dma_mmap_*
+ * @dev: device to check
+ *
+ * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to
+ * map DMA allocations to userspace.
+ */
+bool dma_can_mmap(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_is_direct(ops)) {
+ return IS_ENABLED(CONFIG_MMU) &&
+ (dev_is_dma_coherent(dev) ||
+ IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN));
+ }
+
+ return ops->mmap != NULL;
}
+EXPORT_SYMBOL_GPL(dma_can_mmap);
/**
* dma_mmap_attrs - map a coherent DMA allocation into user space
@@ -227,31 +262,15 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->mmap)
- return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
- return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+ if (dma_is_direct(ops))
+ return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
+ if (!ops->mmap)
+ return -ENXIO;
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_mmap_attrs);
-static u64 dma_default_get_required_mask(struct device *dev)
-{
- u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
- u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
- u64 mask;
-
- if (!high_totalram) {
- /* convert to mask just covering totalram */
- low_totalram = (1 << (fls(low_totalram) - 1));
- low_totalram += low_totalram - 1;
- mask = low_totalram;
- } else {
- high_totalram = (1 << (fls(high_totalram) - 1));
- high_totalram += high_totalram - 1;
- mask = (((u64)high_totalram) << 32) + 0xffffffff;
- }
- return mask;
-}
-
u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -260,7 +279,16 @@ u64 dma_get_required_mask(struct device *dev)
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
- return dma_default_get_required_mask(dev);
+
+ /*
+ * We require every DMA ops implementation to at least support a 32-bit
+ * DMA mask (and use bounce buffering if that isn't supported in
+ * hardware). As the direct mapping code has its own routine to
+ * actually report an optimal mask we default to 32-bit here as that
+ * is the right thing for most IOMMUs, and at least not actively
+ * harmful in general.
+ */
+ return DMA_BIT_MASK(32);
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
@@ -317,12 +345,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
EXPORT_SYMBOL(dma_free_attrs);
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
- if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
- dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -353,7 +375,6 @@ int dma_set_mask(struct device *dev, u64 mask)
return -EIO;
arch_dma_set_mask(dev, mask);
- dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
}
@@ -371,7 +392,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
if (!dma_supported(dev, mask))
return -EIO;
- dma_check_mask(dev, mask);
dev->coherent_dma_mask = mask;
return 0;
}
@@ -405,3 +425,14 @@ size_t dma_max_mapping_size(struct device *dev)
return size;
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+
+unsigned long dma_get_merge_boundary(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!ops || !ops->get_merge_boundary)
+ return 0; /* can't merge */
+
+ return ops->get_merge_boundary(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
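
dma_can_mmap() lets a driver find out up front whether dma_mmap_coherent()/dma_mmap_attrs() can work for a device, instead of discovering -ENXIO inside its mmap handler. A minimal sketch of a hypothetical driver using it (the surrounding device structure and names are assumptions):

    #include <linux/dma-mapping.h>
    #include <linux/mm.h>

    /* Hypothetical per-device state holding a coherent DMA buffer. */
    struct example_dev {
            struct device *dev;
            void *cpu_addr;
            dma_addr_t dma_addr;
            size_t size;
    };

    static int example_mmap(struct example_dev *edev, struct vm_area_struct *vma)
    {
            if (!dma_can_mmap(edev->dev))
                    return -ENXIO;  /* don't bother calling dma_mmap_coherent() */

            return dma_mmap_coherent(edev->dev, vma, edev->cpu_addr,
                                     edev->dma_addr, edev->size);
    }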
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index ffe78f0b2fe4..ca4e5d44b571 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -11,13 +11,21 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
+struct page **dma_common_find_pages(void *cpu_addr)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || area->flags != VM_DMA_COHERENT)
+ return NULL;
+ return area->pages;
+}
+
static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ size_t size, pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = get_vm_area_caller(size, vm_flags, caller);
+ area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
if (!area)
return NULL;
@@ -34,12 +42,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_pages_remap(struct page **pages, size_t size,
- unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
if (!area)
return NULL;
@@ -53,7 +60,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_contiguous_remap(struct page *page, size_t size,
- unsigned long vm_flags,
pgprot_t prot, const void *caller)
{
int i;
@@ -67,7 +73,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
for (i = 0; i < (size >> PAGE_SHIFT); i++)
pages[i] = nth_page(page, i);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
kfree(pages);
@@ -79,11 +85,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
/*
* Unmaps a range previously mapped by dma_common_*_remap
*/
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+void dma_common_free_remap(void *cpu_addr, size_t size)
{
- struct vm_struct *area = find_vm_area(cpu_addr);
+ struct page **pages = dma_common_find_pages(cpu_addr);
- if (!area || (area->flags & vm_flags) != vm_flags) {
+ if (!pages) {
WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
return;
}
@@ -105,7 +111,16 @@ static int __init early_coherent_pool(char *p)
}
early_param("coherent_pool", early_coherent_pool);
-int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+static gfp_t dma_atomic_pool_gfp(void)
+{
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ return GFP_DMA;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ return GFP_DMA32;
+ return GFP_KERNEL;
+}
+
+static int __init dma_atomic_pool_init(void)
{
unsigned int pool_size_order = get_order(atomic_pool_size);
unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
@@ -117,7 +132,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
page = dma_alloc_from_contiguous(NULL, nr_pages,
pool_size_order, false);
else
- page = alloc_pages(gfp, pool_size_order);
+ page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);
if (!page)
goto out;
@@ -127,8 +142,9 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
if (!atomic_pool)
goto free_page;
- addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
- prot, __builtin_return_address(0));
+ addr = dma_common_contiguous_remap(page, atomic_pool_size,
+ pgprot_dmacoherent(PAGE_KERNEL),
+ __builtin_return_address(0));
if (!addr)
goto destroy_genpool;
@@ -143,7 +159,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
return 0;
remove_mapping:
- dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+ dma_common_free_remap(addr, atomic_pool_size);
destroy_genpool:
gen_pool_destroy(atomic_pool);
atomic_pool = NULL;
@@ -155,6 +171,7 @@ out:
atomic_pool_size / 1024);
return -ENOMEM;
}
+postcore_initcall(dma_atomic_pool_init);
bool dma_in_atomic_pool(void *start, size_t size)
{
@@ -217,7 +234,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
+ ret = dma_common_contiguous_remap(page, size,
dma_pgprot(dev, PAGE_KERNEL, attrs),
__builtin_return_address(0));
if (!ret) {
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 796a44f8ef5a..673a2cdb2656 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -463,8 +463,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
if (mem_encrypt_active())
- pr_warn_once("%s is active and system is using DMA bounce buffers\n",
- sme_active() ? "SME" : "SEV");
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
if (mapping_size > alloc_size) {
dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index fc482c8e0bd8..57fb4dcff434 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/binfmts.h>
+#include <linux/elfcore.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4f08b17d6426..4655adbbae10 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2239,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event,
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
- * remains valid. This condition is satisifed when called through
+ * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
@@ -6054,7 +6054,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6616,7 +6616,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_STACK_USER) {
/*
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
@@ -10917,6 +10917,13 @@ SYSCALL_DEFINE5(perf_event_open,
perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
+ err = security_locked_down(LOCKDOWN_PERF);
+ if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ return err;
+
+ err = 0;
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa00497c49..94d38a39d72e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
#include <linux/uprobes.h>
@@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
+ * @old_page: the page we are replacing by new_page
+ * @new_page: the modified page we replace page by
*
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
@@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (new_page) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
@@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (new_page)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (new_page)
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
@@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
+ bool orig_page_huge = false;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
@@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -488,6 +498,10 @@ retry:
ref_ctr_updated = 1;
}
+ ret = 0;
+ if (!is_register && !PageAnon(old_page))
+ goto put_old;
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
@@ -501,8 +515,33 @@ retry:
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (PageUptodate(orig_page) &&
+ pages_identical(new_page, orig_page)) {
+ /* let go new_page */
+ put_page(new_page);
+ new_page = NULL;
+
+ if (PageCompound(orig_page))
+ orig_page_huge = true;
+ }
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
- put_page(new_page);
+ if (new_page)
+ put_page(new_page);
put_old:
put_page(old_page);
@@ -513,6 +552,10 @@ put_old:
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
+ /* try collapse pmd for compound page */
+ if (!ret && orig_page_huge)
+ collapse_pte_mapped_thp(mm, vaddr);
+
return ret;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 22ab6a4bdc51..a46a50d67002 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+ if (refcount_dec_and_test(&task->rcu_users))
+ call_rcu(&task->rcu, delayed_put_task_struct);
+}
void release_task(struct task_struct *p)
{
@@ -222,76 +227,13 @@ repeat:
write_unlock_irq(&tasklist_lock);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ put_task_struct_rcu_user(p);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
- struct sighand_struct *sighand;
- struct task_struct *task;
-
- /*
- * We need to verify that release_task() was not called and thus
- * delayed_put_task_struct() can't run and drop the last reference
- * before rcu_read_unlock(). We check task->sighand != NULL,
- * but we can read the already freed and reused memory.
- */
-retry:
- task = rcu_dereference(*ptask);
- if (!task)
- return NULL;
-
- probe_kernel_address(&task->sighand, sighand);
-
- /*
- * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
- * was already freed we can not miss the preceding update of this
- * pointer.
- */
- smp_rmb();
- if (unlikely(task != READ_ONCE(*ptask)))
- goto retry;
-
- /*
- * We've re-checked that "task == *ptask", now we have two different
- * cases:
- *
- * 1. This is actually the same task/task_struct. In this case
- * sighand != NULL tells us it is still alive.
- *
- * 2. This is another task which got the same memory for task_struct.
- * We can't know this of course, and we can not trust
- * sighand != NULL.
- *
- * In this case we actually return a random value, but this is
- * correct.
- *
- * If we return NULL - we can pretend that we actually noticed that
- * *ptask was updated when the previous task has exited. Or pretend
- * that probe_slab_address(&sighand) reads NULL.
- *
- * If we return the new task (because sighand is not NULL for any
- * reason) - this is fine too. This (new) task can't go away before
- * another gp pass.
- *
- * And note: We could even eliminate the false positive if re-read
- * task->sighand once again to avoid the falsely NULL. But this case
- * is very unlikely so we don't care.
- */
- if (!sighand)
- return NULL;
-
- return task;
-}
-
void rcuwait_wake_up(struct rcuwait *w)
{
struct task_struct *task;
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)
*/
smp_mb(); /* (B) */
- /*
- * Avoid using task_rcu_dereference() magic as long as we are careful,
- * see comment in rcuwait_wait_event() regarding ->exit_state.
- */
task = rcu_dereference(w->task);
if (task)
wake_up_process(task);
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
}
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+ return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table,
- __stop___ex_table - __start___ex_table, addr);
+ e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
diff --git a/kernel/fork.c b/kernel/fork.c
index 53e780748fe3..f9572f416126 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */
static int max_threads; /* tunable limit on nr_threads */
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
+
+static const char * const resident_page_types[] = {
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
+
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
@@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm)
{
int i;
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
+
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = atomic_long_read(&mm->rss_stat.count[i]);
if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
+ mm, resident_page_types[i], x);
}
if (mm_pgtables_bytes(mm))
@@ -903,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->cpus_ptr = &tsk->cpus_mask;
/*
- * One for us, one for whoever does the "release_task()" (usually
- * parent)
+ * One for the user space visible state that goes away when reaped.
+ * One for the scheduler.
*/
- refcount_set(&tsk->usage, 2);
+ refcount_set(&tsk->rcu_users, 2);
+ /* One for the rcu users */
+ refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -1009,7 +1023,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
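
The NAMED_ARRAY_INDEX()/__stringify() pattern in the fork.c hunk keeps the printable rss-counter names in lockstep with the enum via designated initializers, and BUILD_BUG_ON_MSG() turns a forgotten update into a build failure instead of a runtime surprise. A generic sketch of the same pattern around a hypothetical enum:

    #include <linux/bug.h>
    #include <linux/kernel.h>
    #include <linux/stringify.h>

    enum example_counter { EX_READS, EX_WRITES, EX_ERRORS, NR_EX_COUNTERS };

    #define NAMED_ARRAY_INDEX(x)    [x] = __stringify(x)

    /* Designated initializers tie each string to its enum slot by name,
     * so reordering the enum cannot silently mislabel the output. */
    static const char * const example_counter_names[] = {
            NAMED_ARRAY_INDEX(EX_READS),
            NAMED_ARRAY_INDEX(EX_WRITES),
            NAMED_ARRAY_INDEX(EX_ERRORS),
    };

    static void example_check(void)
    {
            /* Fails the build if an enum value is added without a name. */
            BUILD_BUG_ON_MSG(ARRAY_SIZE(example_counter_names) != NR_EX_COUNTERS,
                             "example_counter_names[] is out of date");
    }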
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3941a9c48f83..060e8e726755 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS if !UML
+ select CONSTRUCTORS
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1b018f1a6e0d..bc933c0db9bf 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -206,6 +206,14 @@ static inline int kexec_load_check(unsigned long nr_segments,
return result;
/*
+ * kexec can be used to circumvent module loading restrictions, so
+ * prevent loading in that case
+ */
+ result = security_locked_down(LOCKDOWN_KEXEC);
+ if (result)
+ return result;
+
+ /*
* Verify we have a legal set of flags
* This leaves us room for future extensions.
*/
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5870723b8ad..15d70a90b50d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
+ if (fatal_signal_pending(current))
+ return NULL;
pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b8cc032d5620..79f252af7dee 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
return kexec_image_post_load_cleanup_default(image);
}
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
unsigned long buf_len)
{
@@ -177,6 +177,59 @@ void kimage_file_post_load_cleanup(struct kimage *image)
image->image_loader_data = NULL;
}
+#ifdef CONFIG_KEXEC_SIG
+static int
+kimage_validate_signature(struct kimage *image)
+{
+ const char *reason;
+ int ret;
+
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ switch (ret) {
+ case 0:
+ break;
+
+ /* Certain verification errors are non-fatal if we're not
+ * checking errors, provided we aren't mandating that there
+ * must be a valid signature.
+ */
+ case -ENODATA:
+ reason = "kexec of unsigned image";
+ goto decide;
+ case -ENOPKG:
+ reason = "kexec of image with unsupported crypto";
+ goto decide;
+ case -ENOKEY:
+ reason = "kexec of image with unavailable key";
+ decide:
+ if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ pr_notice("%s rejected\n", reason);
+ return ret;
+ }
+
+ /* If IMA is guaranteed to appraise a signature on the kexec
+ * image, permit it even if the kernel is otherwise locked
+ * down.
+ */
+ if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
+ security_locked_down(LOCKDOWN_KEXEC))
+ return -EPERM;
+
+ return 0;
+
+ /* All other errors are fatal, including nomem, unparseable
+ * signatures and signature check failures - even if signatures
+ * aren't required.
+ */
+ default:
+ pr_notice("kernel signature verification failed (%d).\n", ret);
+ }
+
+ return ret;
+}
+#endif
+
/*
* In file mode list of segments is prepared by kernel. Copy relevant
* data from user space, do error checking, prepare segment list
@@ -186,7 +239,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
const char __user *cmdline_ptr,
unsigned long cmdline_len, unsigned flags)
{
- int ret = 0;
+ int ret;
void *ldata;
loff_t size;
@@ -202,14 +255,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
if (ret)
goto out;
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
+#ifdef CONFIG_KEXEC_SIG
+ ret = kimage_validate_signature(image);
+
+ if (ret)
goto out;
- }
- pr_debug("kernel signature verification successful.\n");
#endif
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1b66ccbb744a..53534aa258a6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -962,8 +962,15 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
+
+static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
@@ -976,58 +983,75 @@ static int prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static int arm_kprobe_ftrace(struct kprobe *p)
+static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 0, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
if (ret) {
pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
- if (kprobe_ftrace_enabled == 0) {
- ret = register_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 0) {
+ ret = register_ftrace_function(ops);
if (ret) {
pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
goto err_ftrace;
}
}
- kprobe_ftrace_enabled++;
+ (*cnt)++;
return ret;
err_ftrace:
/*
- * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
- * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
- * empty filter_hash which would undesirably trace all functions.
+ * At this point, since ops is not registered, we should be safe from
+ * registering an empty filter.
*/
- ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
return ret;
}
+static int arm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __arm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
+
/* Caller must lock kprobe_mutex */
-static int disarm_kprobe_ftrace(struct kprobe *p)
+static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- if (kprobe_ftrace_enabled == 1) {
- ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 1) {
+ ret = unregister_ftrace_function(ops);
if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
return ret;
}
- kprobe_ftrace_enabled--;
+ (*cnt)--;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 1, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
+
+static int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __disarm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) (-ENODEV)
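
With the ops split above, only kprobes that install a post_handler go through kprobe_ipmodify_ops (SAVE_REGS | IPMODIFY); a pre_handler-only kprobe is armed via the plain SAVE_REGS kprobe_ftrace_ops and so no longer collides with other IPMODIFY users such as livepatch on the same function. A sketch of a pre_handler-only kprobe module (the target symbol is only an example):

    #include <linux/kprobes.h>
    #include <linux/module.h>

    static int example_pre(struct kprobe *p, struct pt_regs *regs)
    {
            pr_info("hit %s\n", p->symbol_name);
            return 0;
    }

    /* No post_handler, so arm_kprobe_ftrace() selects kprobe_ftrace_ops
     * (SAVE_REGS only) rather than kprobe_ipmodify_ops. */
    static struct kprobe example_kp = {
            .symbol_name    = "do_sys_open",        /* hypothetical target */
            .pre_handler    = example_pre,
    };

    static int __init example_init(void)
    {
            return register_kprobe(&example_kp);
    }

    static void __exit example_exit(void)
    {
            unregister_kprobe(&example_kp);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");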
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c4ce08f43bd6..ab4a4606d19b 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1175,6 +1175,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
+ obj->mod = NULL;
klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 89bab079e7a4..e84d21aa0722 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
+ return READ_ONCE(prev->state) != vcpu_running;
}
/*
diff --git a/kernel/module.c b/kernel/module.c
index 9ee93421269c..ff2d7359a418 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -544,12 +545,20 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym)
#endif
}
-static int cmp_name(const void *va, const void *vb)
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
{
- const char *a;
- const struct kernel_symbol *b;
- a = va; b = vb;
- return strcmp(a, kernel_symbol_name(b));
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+static int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
}
static bool find_exported_symbol_in_section(const struct symsearch *syms,
@@ -1379,6 +1388,41 @@ static inline int same_magic(const char *amagic, const char *bmagic,
}
#endif /* CONFIG_MODVERSIONS */
+static char *get_modinfo(const struct load_info *info, const char *tag);
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev);
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace) {
+ imported_namespace = get_modinfo(info, "import_ns");
+ while (imported_namespace) {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ imported_namespace = get_next_modinfo(
+ info, "import_ns", imported_namespace);
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
const struct load_info *info,
@@ -1407,6 +1451,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
goto getname;
}
+ err = verify_namespace_is_imported(info, sym, mod);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
err = ref_module(mod, owner);
if (err) {
sym = ERR_PTR(err);
@@ -2481,7 +2531,8 @@ static char *next_string(char *string, unsigned long *secsize)
return string;
}
-static char *get_modinfo(struct load_info *info, const char *tag)
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
{
char *p;
unsigned int taglen = strlen(tag);
@@ -2492,13 +2543,25 @@ static char *get_modinfo(struct load_info *info, const char *tag)
* get_modinfo() calls made before rewrite_section_headers()
* must use sh_offset, as sh_addr isn't set!
*/
- for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) {
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = next_string(prev, &size);
+ }
+
+ for (p = modinfo; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
return NULL;
}
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
@@ -2776,8 +2839,9 @@ static inline void kmemleak_load_module(const struct module *mod,
#ifdef CONFIG_MODULE_SIG
static int module_sig_check(struct load_info *info, int flags)
{
- int err = -ENOKEY;
+ int err = -ENODATA;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
const void *mod = info->hdr;
/*
@@ -2792,16 +2856,38 @@ static int module_sig_check(struct load_info *info, int flags)
err = mod_verify_sig(mod, info);
}
- if (!err) {
+ switch (err) {
+ case 0:
info->sig_ok = true;
return 0;
- }
- /* Not having a signature is only an error if we're strict. */
- if (err == -ENOKEY && !is_module_sig_enforced())
- err = 0;
+ /* We don't permit modules to be loaded into trusted kernels
+ * without a valid signature on them, but if we're not
+ * enforcing, certain errors are non-fatal.
+ */
+ case -ENODATA:
+ reason = "Loading of unsigned module";
+ goto decide;
+ case -ENOPKG:
+ reason = "Loading of module with unsupported crypto";
+ goto decide;
+ case -ENOKEY:
+ reason = "Loading of module with unavailable key";
+ decide:
+ if (is_module_sig_enforced()) {
+ pr_notice("%s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
- return err;
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+
+ /* All other errors are fatal, including nomem, unparseable
+ * signatures and signature check failures - even if signatures
+ * aren't required.
+ */
+ default:
+ return err;
+ }
}
#else /* !CONFIG_MODULE_SIG */
static int module_sig_check(struct load_info *info, int flags)
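
verify_namespace_is_imported() above enforces the new symbol-namespace metadata: a module may only resolve a namespaced export if its modinfo carries a matching "import_ns" entry, and a missing import is either a warning or a hard -EINVAL depending on CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS. The modinfo entries are normally produced by the namespace export/import macros from the wider series (not shown in this hunk), roughly along these lines:

    #include <linux/export.h>
    #include <linux/module.h>

    /* In the exporting module: place the symbol in a namespace. */
    int example_helper(void)
    {
            return 42;
    }
    EXPORT_SYMBOL_NS_GPL(example_helper, EXAMPLE_NS);

    /* In the importing module: record "import_ns=EXAMPLE_NS" in modinfo so
     * that verify_namespace_is_imported() accepts references to the symbol. */
    MODULE_IMPORT_NS(EXAMPLE_NS);
    MODULE_LICENSE("GPL");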
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index 000000000000..4224a1086b7d
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/module_signature.h>
+#include <asm/byteorder.h>
+
+/**
+ * mod_check_sig - check that the given signature is sane
+ *
+ * @ms: Signature to check.
+ * @file_len: Size of the file to which @ms is appended.
+ * @name: What is being checked. Used for error messages.
+ */
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+ const char *name)
+{
+ if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
+ return -EBADMSG;
+
+ if (ms->id_type != PKEY_ID_PKCS7) {
+ pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ name);
+ return -ENOPKG;
+ }
+
+ if (ms->algo != 0 ||
+ ms->hash != 0 ||
+ ms->signer_len != 0 ||
+ ms->key_id_len != 0 ||
+ ms->__pad[0] != 0 ||
+ ms->__pad[1] != 0 ||
+ ms->__pad[2] != 0) {
+ pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+ name);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index b10fb1986ca9..9d9fc678c91d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -7,37 +7,13 @@
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
#include <linux/string.h>
#include <linux/verification.h>
#include <crypto/public_key.h>
#include "module-internal.h"
-enum pkey_id_type {
- PKEY_ID_PGP, /* OpenPGP generated key ID */
- PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */
- PKEY_ID_PKCS7, /* Signature in PKCS#7 message */
-};
-
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- * - Signer's name
- * - Key identifier
- * - Signature data
- * - Information block
- */
-struct module_signature {
- u8 algo; /* Public-key crypto algorithm [0] */
- u8 hash; /* Digest algorithm [0] */
- u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
- u8 signer_len; /* Length of signer's name [0] */
- u8 key_id_len; /* Length of key identifier [0] */
- u8 __pad[3];
- __be32 sig_len; /* Length of signature data */
-};
-
/*
* Verify the signature on a module.
*/
@@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
{
struct module_signature ms;
size_t sig_len, modlen = info->len;
+ int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info)
return -EBADMSG;
memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
- modlen -= sizeof(ms);
+
+ ret = mod_check_sig(&ms, modlen, info->name);
+ if (ret)
+ return ret;
sig_len = be32_to_cpu(ms.sig_len);
- if (sig_len >= modlen)
- return -EBADMSG;
- modlen -= sig_len;
+ modlen -= sig_len + sizeof(ms);
info->len = modlen;
- if (ms.id_type != PKEY_ID_PKCS7) {
- pr_err("%s: Module is not signed with expected PKCS#7 message\n",
- info->name);
- return -ENOPKG;
- }
-
- if (ms.algo != 0 ||
- ms.hash != 0 ||
- ms.signer_len != 0 ||
- ms.key_id_len != 0 ||
- ms.__pad[0] != 0 ||
- ms.__pad[1] != 0 ||
- ms.__pad[2] != 0) {
- pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
- info->name);
- return -EBADMSG;
- }
-
return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
VERIFY_USE_SECONDARY_KEYRING,
VERIFYING_MODULE_SIGNATURE,
diff --git a/kernel/panic.c b/kernel/panic.c
index 057540b6eee9..47e8ebccc22b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/sched/debug.h>
#include <linux/interrupt.h>
+#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
@@ -220,6 +221,13 @@ void panic(const char *fmt, ...)
#endif
/*
+ * If kgdb is enabled, give it a chance to run before we stop all
+ * the other CPUs or else we won't be able to debug processes left
+ * running on them.
+ */
+ kgdb_panic(buf);
+
+ /*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
@@ -551,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- if (args)
- pr_warn(CUT_HERE);
-
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
@@ -591,37 +596,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
struct warn_args args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
- &args);
- va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
+ pr_warn(CUT_HERE);
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
-{
- struct warn_args args;
+ if (!fmt) {
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
- pr_warn(CUT_HERE);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
diff --git a/kernel/params.c b/kernel/params.c
index cf448785d058..8e56f8b12d8f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -12,6 +12,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/security.h>
#ifdef CONFIG_SYSFS
/* Protects all built-in parameters, modules use their own param_lock */
@@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
-static void param_check_unsafe(const struct kernel_param *kp)
+static bool param_check_unsafe(const struct kernel_param *kp)
{
+ if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
+ security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
+ return false;
+
if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
pr_notice("Setting dangerous option %s - tainting kernel\n",
kp->name);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
+
+ return true;
}
static int parse_one(char *param,
@@ -132,8 +139,10 @@ static int parse_one(char *param,
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
kernel_param_lock(params[i].mod);
- param_check_unsafe(&params[i]);
- err = params[i].ops->set(val, &params[i]);
+ if (param_check_unsafe(&params[i]))
+ err = params[i].ops->set(val, &params[i]);
+ else
+ err = -EPERM;
kernel_param_unlock(params[i].mod);
return err;
}
@@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
kernel_param_lock(mk->mod);
- param_check_unsafe(attribute->param);
- err = attribute->param->ops->set(buf, attribute->param);
+ if (param_check_unsafe(attribute->param))
+ err = attribute->param->ops->set(buf, attribute->param);
+ else
+ err = -EPERM;
kernel_param_unlock(mk->mod);
if (!err)
return len;
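
The param_check_unsafe() change above turns the helper into a predicate so a locked-down kernel can reject hardware-tweaking parameters with -EPERM instead of applying them. A minimal userspace sketch of that control flow; the flag values and security_locked_down_stub() below are stand-ins, not the kernel definitions:

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

#define KERNEL_PARAM_FL_UNSAFE  (1 << 0)        /* stand-in flag bits */
#define KERNEL_PARAM_FL_HWPARAM (1 << 1)

struct kernel_param {
        const char *name;
        unsigned int flags;
        int (*set)(const char *val, const struct kernel_param *kp);
};

/* Pretend the kernel is locked down; the real code asks the LSM. */
static bool security_locked_down_stub(void)
{
        return true;
}

static bool param_check_unsafe(const struct kernel_param *kp)
{
        if (kp->flags & KERNEL_PARAM_FL_HWPARAM && security_locked_down_stub())
                return false;

        if (kp->flags & KERNEL_PARAM_FL_UNSAFE)
                printf("Setting dangerous option %s - tainting kernel\n",
                       kp->name);

        return true;
}

static int set_uint(const char *val, const struct kernel_param *kp)
{
        printf("%s = %s\n", kp->name, val);
        return 0;
}

int main(void)
{
        struct kernel_param io_port = {
                .name = "io_port",
                .flags = KERNEL_PARAM_FL_HWPARAM,
                .set = set_uint,
        };
        int err;

        /* Mirrors parse_one(): only call ->set() when the check passes. */
        if (param_check_unsafe(&io_port))
                err = io_port.set("0x300", &io_port);
        else
                err = -EPERM;

        printf("err = %d\n", err);
        return 0;
}

The same check now gates both the boot-time parse_one() path and the sysfs param_attr_store() path, as the two hunks above show.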
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cd7434e6000d..3c0a5a8170b0 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -30,6 +30,7 @@
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <linux/ktime.h>
+#include <linux/security.h>
#include <trace/events/power.h>
#include "power.h"
@@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops;
bool hibernation_available(void)
{
- return (nohibernate == 0);
+ return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
}
/**
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 1d21ebacfdb8..17a9591e54ff 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -11,11 +11,18 @@
int _braille_console_setup(char **str, char **brl_options)
{
- if (!strncmp(*str, "brl,", 4)) {
+ size_t len;
+
+ len = str_has_prefix(*str, "brl,");
+ if (len) {
*brl_options = "";
- *str += 4;
- } else if (!strncmp(*str, "brl=", 4)) {
- *brl_options = *str + 4;
+ *str += len;
+ return 0;
+ }
+
+ len = str_has_prefix(*str, "brl=");
+ if (len) {
+ *brl_options = *str + len;
*str = strchr(*brl_options, ',');
if (!*str) {
pr_err("need port name after brl=\n");
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1888f6a3b694..ca65327a6de8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
static int __control_devkmsg(char *str)
{
+ size_t len;
+
if (!str)
return -EINVAL;
- if (!strncmp(str, "on", 2)) {
+ len = str_has_prefix(str, "on");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_ON;
- return 2;
- } else if (!strncmp(str, "off", 3)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "off");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_OFF;
- return 3;
- } else if (!strncmp(str, "ratelimit", 9)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "ratelimit");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
- return 9;
+ return len;
}
+
return -EINVAL;
}
@@ -3274,7 +3284,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- while (l > size && seq < dumper->next_seq) {
+ while (l >= size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
l -= msg_print_text(msg, true, time, NULL, 0);
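
Both the braille and devkmsg hunks replace strncmp()-plus-magic-length with str_has_prefix(), which returns the matched prefix length (or 0) so callers can advance by the return value. A userspace sketch of the same pattern; str_has_prefix() is re-implemented locally here since the kernel helper lives in the kernel's string headers:

#include <stdio.h>
#include <string.h>

/* Local re-implementation of the kernel helper, for illustration only. */
static size_t str_has_prefix(const char *str, const char *prefix)
{
        size_t len = strlen(prefix);

        return strncmp(str, prefix, len) == 0 ? len : 0;
}

/* Mirrors __control_devkmsg(): map a keyword to a mode, return chars consumed. */
static int control_devkmsg(const char *str, const char **mode)
{
        size_t len;

        if ((len = str_has_prefix(str, "on"))) {
                *mode = "ON";
                return len;
        }
        if ((len = str_has_prefix(str, "off"))) {
                *mode = "OFF";
                return len;
        }
        if ((len = str_has_prefix(str, "ratelimit"))) {
                *mode = "DEFAULT";
                return len;
        }
        return -1;
}

int main(void)
{
        const char *mode = "?";
        int consumed = control_devkmsg("ratelimit", &mode);

        printf("mode=%s consumed=%d\n", mode, consumed);
        return 0;
}

Returning the length instead of a hard-coded 2/3/9 keeps the consumed-character count and the matched keyword in one place.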
diff --git a/kernel/resource.c b/kernel/resource.c
index 7ea4306503c5..76036a41143b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
false, &res)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
@@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head)
EXPORT_SYMBOL(resource_list_free);
#ifdef CONFIG_DEVICE_PRIVATE
-/**
- * devm_request_free_mem_region - find free region for device private memory
- *
- * @dev: device struct to bind the resource to
- * @size: size in bytes of the device memory to add
- * @base: resource tree to look in
- *
- * This function tries to find an empty range of physical address big enough to
- * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
- * memory, which in turn allocates struct pages.
- */
-struct resource *devm_request_free_mem_region(struct device *dev,
- struct resource *base, unsigned long size)
+static struct resource *__request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size, const char *name)
{
resource_size_t end, addr;
struct resource *res;
@@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev,
REGION_DISJOINT)
continue;
- res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+ if (dev)
+ res = devm_request_mem_region(dev, addr, size, name);
+ else
+ res = request_mem_region(addr, size, name);
if (!res)
return ERR_PTR(-ENOMEM);
res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
@@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev,
return ERR_PTR(-ERANGE);
}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical addresses big enough to
+ * This function tries to find an empty range of physical addresses big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ return __request_free_mem_region(dev, base, size, dev_name(dev));
+}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name)
+{
+ return __request_free_mem_region(NULL, base, size, name);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
#endif /* CONFIG_DEVICE_PRIVATE */
static int __init strict_iomem(char *str)
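
The walk_system_ram_range() hunk swaps the open-coded shifts for PFN_UP()/PFN_DOWN(): the start of a resource is rounded up to a whole page and the exclusive end rounded down, so partial pages at either edge are skipped. A standalone check of that arithmetic, with PAGE_SHIFT fixed at 12 purely for the example:

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
        /* A resource covering [0x1800, 0x57ff]: only pages 2..4 lie fully inside. */
        unsigned long res_start = 0x1800, res_end = 0x57ff;
        unsigned long pfn = PFN_UP(res_start);
        unsigned long end_pfn = PFN_DOWN(res_end + 1);

        if (end_pfn > pfn)
                printf("callback would see pfn=%lu, nr_pages=%lu\n",
                       pfn, end_pfn - pfn);
        return 0;
}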
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e8387bdd09c..7880f4f64d0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
@@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
/* Task is done with its stack. */
put_task_stack(prev);
- put_task_struct(prev);
+ put_task_struct_rcu_user(prev);
}
tick_nohz_task_switch();
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
else
prev->active_mm = NULL;
} else { // to user
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
- * rq->curr and returning to userspace.
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
*
* The below provides this either through switch_mm(), or in
* case 'prev->active_mm == next->mm' through
* finish_task_switch()'s mmdrop().
*/
-
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
@@ -3871,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
@@ -3989,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -4033,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
- rq->curr = next;
+ /*
+ * RCU users of rcu_dereference(rq->curr) may not see
+ * changes to task_struct made by pick_next_task().
+ */
+ RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -4214,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void)
#ifdef CONFIG_PREEMPTION
/*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
*/
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
@@ -4287,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPTION */
/*
- * this is the entry point to schedule() from kernel preemption
+ * This is the entry point to schedule() from kernel preemption
* off of irq context.
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
@@ -6060,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu)
__set_task_cpu(idle, cpu);
rcu_read_unlock();
- rq->curr = rq->idle = idle;
+ rq->idle = idle;
+ rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
@@ -6421,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);
- update_max_interval();
-
return 0;
}
@@ -6763,7 +6774,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check();
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
+ !is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
@@ -6779,8 +6790,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
if (task_stack_end_corrupted(current))
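
rq->curr is now published with RCU_INIT_POINTER()/rcu_assign_pointer() and read with rcu_dereference(), so remote readers such as membarrier and task_numa_compare() observe a fully initialised task. The publish/consume ordering can be sketched in userspace with C11 release/acquire atomics; this is only an analogy for the barrier pairing, not the kernel's RCU machinery (build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct task {
        int pid;
        int flags;
};

static _Atomic(struct task *) curr;     /* plays the role of rq->curr */

static void *scheduler(void *arg)
{
        struct task *next = malloc(sizeof(*next));

        (void)arg;
        next->pid = 42;
        next->flags = 0;
        /* Like rcu_assign_pointer(): publish only after the fields are set. */
        atomic_store_explicit(&curr, next, memory_order_release);
        return NULL;
}

static void *observer(void *arg)
{
        struct task *p;

        (void)arg;
        /* Like rcu_dereference(): the acquire pairs with the release above. */
        while (!(p = atomic_load_explicit(&curr, memory_order_acquire)))
                ;
        printf("saw pid %d flags %d\n", p->pid, p->flags);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, scheduler, NULL);
        pthread_create(&b, NULL, observer, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        free(atomic_load_explicit(&curr, memory_order_relaxed));
        return 0;
}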
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bbf68c3161..83ab35e2374f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);
/*
@@ -1603,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env,
return;
rcu_read_lock();
- cur = task_rcu_dereference(&dst_rq->curr);
+ cur = rcu_dereference(dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
@@ -4354,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)
}
/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
- if (cfs_b->quota == RUNTIME_INF)
- return;
-
- now = sched_clock_cpu(smp_processor_id());
- cfs_b->runtime = cfs_b->quota;
+ if (cfs_b->quota != RUNTIME_INF)
+ cfs_b->runtime = cfs_b->quota;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4376,15 +4370,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
- if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
-
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
-}
-
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4476,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- /* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
@@ -4994,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- u64 overrun;
-
lockdep_assert_held(&cfs_b->lock);
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
- overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5080,11 +5062,6 @@ static inline bool cfs_bandwidth_used(void)
return false;
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
- return rq_clock_task(rq_of(cfs_rq));
-}
-
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -6412,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
/* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0) {
+ if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
cur_delta -= base_energy_pd;
if (cur_delta < best_delta) {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c892c6280c9f..8dad5aa600ea 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -238,7 +238,6 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- check_pgt_cache();
rmb();
local_irq_disable();
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..a39bed2c784f 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,10 +30,42 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_rq_state(void *info)
+{
+ struct mm_struct *mm = (struct mm_struct *) info;
+
+ if (current->mm != mm)
+ return;
+ this_cpu_write(runqueues.membarrier_state,
+ atomic_read(&mm->membarrier_state));
+ /*
+ * Issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+ * guarantee that no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+ /*
+ * Issue a memory barrier before clearing membarrier_state to
+ * guarantee that no memory access prior to exec is reordered after
+ * clearing this state.
+ */
+ smp_mb();
+ atomic_set(&mm->membarrier_state, 0);
+ /*
+ * Keep the runqueue membarrier_state in sync with this mm
+ * membarrier_state.
+ */
+ this_cpu_write(runqueues.membarrier_state, 0);
+}
+
static int membarrier_global_expedited(void)
{
int cpu;
- bool fallback = false;
cpumask_var_t tmpmask;
if (num_online_cpus() == 1)
@@ -45,17 +77,11 @@ static int membarrier_global_expedited(void)
*/
smp_mb(); /* system call entry is not a mb. */
- /*
- * Expedited membarrier commands guarantee that they won't
- * block, hence the GFP_NOWAIT allocation flag and fallback
- * implementation.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
- /* Fallback for OOM. */
- fallback = true;
- }
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
cpus_read_lock();
+ rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -70,23 +96,28 @@ static int membarrier_global_expedited(void)
if (cpu == raw_smp_processor_id())
continue;
- rcu_read_lock();
- p = task_rcu_dereference(&cpu_rq(cpu)->curr);
- if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
- MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
- if (!fallback)
- __cpumask_set_cpu(cpu, tmpmask);
- else
- smp_call_function_single(cpu, ipi_mb, NULL, 1);
- }
- rcu_read_unlock();
- }
- if (!fallback) {
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
- free_cpumask_var(tmpmask);
+ if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+ continue;
+
+ /*
+ * Skip the CPU if it runs a kernel thread. The scheduler
+ * leaves the prior task mm in place as an optimization when
+ * scheduling a kthread.
+ */
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ __cpumask_set_cpu(cpu, tmpmask);
}
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -101,22 +132,22 @@ static int membarrier_global_expedited(void)
static int membarrier_private_expedited(int flags)
{
int cpu;
- bool fallback = false;
cpumask_var_t tmpmask;
+ struct mm_struct *mm = current->mm;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
- if (!(atomic_read(&current->mm->membarrier_state) &
+ if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
} else {
- if (!(atomic_read(&current->mm->membarrier_state) &
+ if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
}
- if (num_online_cpus() == 1)
+ if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
return 0;
/*
@@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
- /*
- * Expedited membarrier commands guarantee that they won't
- * block, hence the GFP_NOWAIT allocation flag and fallback
- * implementation.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
- /* Fallback for OOM. */
- fallback = true;
- }
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
cpus_read_lock();
+ rcu_read_lock();
for_each_online_cpu(cpu) {
struct task_struct *p;
@@ -150,21 +175,17 @@ static int membarrier_private_expedited(int flags)
if (cpu == raw_smp_processor_id())
continue;
rcu_read_lock();
- p = task_rcu_dereference(&cpu_rq(cpu)->curr);
- if (p && p->mm == current->mm) {
- if (!fallback)
- __cpumask_set_cpu(cpu, tmpmask);
- else
- smp_call_function_single(cpu, ipi_mb, NULL, 1);
- }
- rcu_read_unlock();
- }
- if (!fallback) {
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
- free_cpumask_var(tmpmask);
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
}
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -177,32 +198,78 @@ static int membarrier_private_expedited(int flags)
return 0;
}
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+ int membarrier_state = atomic_read(&mm->membarrier_state);
+ cpumask_var_t tmpmask;
+ int cpu;
+
+ if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+ this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+ /*
+ * For single mm user, we can simply issue a memory barrier
+ * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+ * mm and in the current runqueue to guarantee that no memory
+ * access following registration is reordered before
+ * registration.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * For mm with multiple users, we need to ensure all future
+ * scheduler executions will observe @mm's new membarrier
+ * state.
+ */
+ synchronize_rcu();
+
+ /*
+ * For each cpu runqueue, if the task's mm matches @mm, ensure that all
+ * @mm's membarrier state set bits are also set in the runqueue's
+ * membarrier state. This ensures that a runqueue scheduling
+ * between threads which are users of @mm has its membarrier state
+ * updated.
+ */
+ cpus_read_lock();
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ p = rcu_dereference(rq->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
+ cpus_read_unlock();
+
+ return 0;
+}
+
static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ int ret;
if (atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
return 0;
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
- if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
- /*
- * For single mm user, single threaded process, we can
- * simply issue a memory barrier after setting
- * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
- * no memory access following registration is reordered
- * before registration.
- */
- smp_mb();
- } else {
- /*
- * For multi-mm user threads, we need to ensure all
- * future scheduler executions will observe the new
- * thread flag state for this mm.
- */
- synchronize_rcu();
- }
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
@@ -213,12 +280,15 @@ static int membarrier_register_private_expedited(int flags)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
- int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+ int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+ ret;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
- state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
}
/*
@@ -226,20 +296,15 @@ static int membarrier_register_private_expedited(int flags)
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state) & state)
+ if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
return 0;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
- &mm->membarrier_state);
- if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
- /*
- * Ensure all future scheduler executions will observe the
- * new thread flag state for this process.
- */
- synchronize_rcu();
- }
- atomic_or(state, &mm->membarrier_state);
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ atomic_or(set_state, &mm->membarrier_state);
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
+ atomic_or(ready_state, &mm->membarrier_state);
return 0;
}
@@ -253,8 +318,10 @@ static int membarrier_register_private_expedited(int flags)
* command specified does not exist, not available on the running
* kernel, or if the command argument is invalid, this system call
* returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
*
* All memory accesses performed in program order from each targeted thread
* is guaranteed to be ordered with respect to sys_membarrier(). If we use
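
With the GFP_NOWAIT fallback gone, the expedited membarrier commands can now fail with -ENOMEM, as the updated syscall comment notes. A minimal caller, assuming a kernel with CONFIG_MEMBARRIER and the uapi <linux/membarrier.h> header available:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static int sys_membarrier(int cmd, unsigned int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        /* Registration may now also fail with -ENOMEM (cpumask allocation). */
        if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
                perror("register private expedited");
                return 1;
        }

        if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
                if (errno == ENOMEM)
                        fprintf(stderr, "transient OOM, caller may retry\n");
                else
                        perror("private expedited");
                return 1;
        }

        puts("all running threads of this mm have executed a memory barrier");
        return 0;
}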
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3cb895d14a2..0db2c1b3361e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@ struct rq {
atomic_t nr_iowait;
+#ifdef CONFIG_MEMBARRIER
+ int membarrier_state;
+#endif
+
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain __rcu *sd;
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void)
static inline bool sched_energy_enabled(void) { return false; }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+ int membarrier_state;
+
+ if (prev_mm == next_mm)
+ return;
+
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+}
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d9605b..00fcea236eba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[];
extern struct ctl_table firmware_config_table[];
#endif
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif
@@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec,
.extra1 = SYSCTL_ZERO,
},
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
{
.procname = "legacy_va_layout",
.data = &sysctl_legacy_va_layout,
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0e315a2e77ae..4820823515e9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1678,24 +1678,26 @@ void timer_clear_idle(void)
static int collect_expired_timers(struct timer_base *base,
struct hlist_head *heads)
{
+ unsigned long now = READ_ONCE(jiffies);
+
/*
* NOHZ optimization. After a long idle sleep we need to forward the
* base to current jiffies. Avoid a loop by searching the bitfield for
* the next expiring timer.
*/
- if ((long)(jiffies - base->clk) > 2) {
+ if ((long)(now - base->clk) > 2) {
unsigned long next = __next_timer_interrupt(base);
/*
* If the next timer is ahead of time forward to current
* jiffies, otherwise forward to the next expiry time:
*/
- if (time_after(next, jiffies)) {
+ if (time_after(next, now)) {
/*
* The call site will increment base->clk and then
* terminate the expiry loop immediately.
*/
- base->clk = jiffies;
+ base->clk = now;
return 0;
}
base->clk = next;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e38a010003c..44bd08f2443b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -142,8 +142,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_BPF_READ);
+ if (ret < 0)
+ goto out;
+
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
+out:
memset(dst, 0, size);
return ret;
@@ -585,6 +590,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
{
int ret;
+ ret = security_locked_down(LOCKDOWN_BPF_READ);
+ if (ret < 0)
+ goto out;
+
/*
* The strncpy_from_unsafe() call will likely not fill the entire
* buffer, but that's okay in this circumstance as we're probing
@@ -596,6 +605,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
*/
ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
+out:
memset(dst, 0, size);
return ret;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8dfd5021b933..7950a0356042 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
int index = task->curr_ret_stack;
int i;
- if (ret != (unsigned long)return_to_handler)
+ if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
return ret;
if (index < 0)
@@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
{
int task_idx;
- if (ret != (unsigned long)return_to_handler)
+ if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
return ret;
task_idx = task->curr_ret_stack;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 356b848c697a..62a50bf399d6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6036,11 +6036,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
{
struct ftrace_func_entry *entry;
- if (ftrace_hash_empty(hash))
- return;
-
- entry = __ftrace_lookup_ip(hash, func->ip);
-
+ entry = ftrace_lookup_ip(hash, func->ip);
/*
* Do not allow this rec to match again.
* Yeah, it may waste some memory, but will be removed
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 947ba433865f..252f79c435f8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1854,7 +1854,7 @@ int __init register_tracer(struct tracer *type)
return ret;
}
-void tracing_reset(struct trace_buffer *buf, int cpu)
+static void tracing_reset_cpu(struct trace_buffer *buf, int cpu)
{
struct ring_buffer *buffer = buf->buffer;
@@ -4251,7 +4251,7 @@ static int tracing_open(struct inode *inode, struct file *file)
if (cpu == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(trace_buf, cpu);
+ tracing_reset_cpu(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -4815,15 +4815,15 @@ static const char readme_msg[] =
#endif
#endif /* CONFIG_STACK_TRACER */
#ifdef CONFIG_DYNAMIC_EVENTS
- " dynamic_events\t\t- Add/remove/show the generic dynamic events\n"
+ " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_KPROBE_EVENTS
- " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
+ " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
- " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
+ " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
@@ -4848,7 +4848,7 @@ static const char readme_msg[] =
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
- "\t +|-[u]<offset>(<fetcharg>)\n"
+ "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t <type>\\[<array-size>\\]\n"
@@ -6742,7 +6742,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(&tr->max_buffer);
else
- tracing_reset(&tr->max_buffer, iter->cpu_file);
+ tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
}
break;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..26b0a08f3c7d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -677,7 +677,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
-void tracing_reset(struct trace_buffer *buf, int cpu);
void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index fa100ed3b4de..a41fed46c285 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
return -EINVAL;
event++;
}
+ argc--; argv++;
p = strchr(event, '/');
if (p) {
@@ -61,10 +62,13 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
for_each_dyn_event_safe(pos, n) {
if (type && type != pos->ops)
continue;
- if (pos->ops->match(system, event, pos)) {
- ret = pos->ops->free(pos);
+ if (!pos->ops->match(system, event,
+ argc, (const char **)argv, pos))
+ continue;
+
+ ret = pos->ops->free(pos);
+ if (ret)
break;
- }
}
mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 8c334064e4d6..46898138d2df 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -31,8 +31,9 @@ struct dyn_event;
* @is_busy: Check whether given event is busy so that it can not be deleted.
* Return true if it is busy, otherwides false.
* @free: Delete the given event. Return 0 if success, otherwides error.
- * @match: Check whether given event and system name match this event.
- * Return true if it matches, otherwides false.
+ * @match: Check whether given event and system name match this event. The argc
+ * and argv are used for an exact match. Return true if it matches, otherwise
+ * false.
*
* Except for @create, these methods are called under holding event_mutex.
*/
@@ -43,7 +44,7 @@ struct dyn_event_operations {
bool (*is_busy)(struct dyn_event *ev);
int (*free)(struct dyn_event *ev);
bool (*match)(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
};
/* Register new dyn_event type -- must be called at first */
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ca6b0dff60c5..9468bd8d44a2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -13,6 +13,10 @@
#include <linux/rculist.h>
#include <linux/tracefs.h>
+/* for gfp flag names */
+#include <linux/trace_events.h>
+#include <trace/events/mmflags.h>
+
#include "tracing_map.h"
#include "trace.h"
#include "trace_dynevent.h"
@@ -374,7 +378,7 @@ static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
static int synth_event_release(struct dyn_event *ev);
static bool synth_event_is_busy(struct dyn_event *ev);
static bool synth_event_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations synth_event_ops = {
.create = synth_event_create,
@@ -422,7 +426,7 @@ static bool synth_event_is_busy(struct dyn_event *ev)
}
static bool synth_event_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct synth_event *sev = to_synth_event(ev);
@@ -752,6 +756,8 @@ static int synth_field_size(char *type)
size = sizeof(unsigned long);
else if (strcmp(type, "pid_t") == 0)
size = sizeof(pid_t);
+ else if (strcmp(type, "gfp_t") == 0)
+ size = sizeof(gfp_t);
else if (synth_field_is_string(type))
size = synth_field_string_size(type);
@@ -792,6 +798,8 @@ static const char *synth_field_fmt(char *type)
fmt = "%lu";
else if (strcmp(type, "pid_t") == 0)
fmt = "%d";
+ else if (strcmp(type, "gfp_t") == 0)
+ fmt = "%x";
else if (synth_field_is_string(type))
fmt = "%s";
@@ -834,9 +842,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
+ struct trace_print_flags __flags[] = {
+ __def_gfpflag_names, {-1, NULL} };
+
trace_seq_printf(s, print_fmt, se->fields[i]->name,
entry->fields[n_u64],
i == se->n_fields - 1 ? "" : " ");
+
+ if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
+ trace_seq_puts(s, " (");
+ trace_print_flags_seq(s, "|",
+ entry->fields[n_u64],
+ __flags);
+ trace_seq_putc(s, ')');
+ }
n_u64++;
}
}
@@ -2785,6 +2804,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
return NULL;
}
+ alias->var_ref_idx = var_ref->var_ref_idx;
+
return alias;
}
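
The hist/synthetic-event hunk teaches synthetic events about gfp_t and decodes the flag names (GFP_KERNEL, ...) in the trace output. A small sketch that defines such an event through tracefs; the mount point and the event/field names here are assumptions, adjust for your system:

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/debug/tracing/synthetic_events";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        /* One u64 latency field plus a gfp_t field decoded by this patch. */
        fprintf(f, "wakeup_alloc u64 lat; gfp_t flags\n");
        fclose(f);
        return 0;
}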
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9d483ad9bb6c..324ffbea3556 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -11,6 +11,7 @@
#include <linux/uaccess.h>
#include <linux/rculist.h>
#include <linux/error-injection.h>
+#include <linux/security.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
@@ -39,7 +40,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_kprobe_release(struct dyn_event *ev);
static bool trace_kprobe_is_busy(struct dyn_event *ev);
static bool trace_kprobe_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations trace_kprobe_ops = {
.create = trace_kprobe_create,
@@ -137,13 +138,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev)
return trace_probe_is_enabled(&tk->tp);
}
+static bool trace_kprobe_match_command_head(struct trace_kprobe *tk,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+
+ if (!argc)
+ return true;
+
+ if (!tk->symbol)
+ snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr);
+ else if (tk->rp.kp.offset)
+ snprintf(buf, sizeof(buf), "%s+%u",
+ trace_kprobe_symbol(tk), tk->rp.kp.offset);
+ else
+ snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk));
+ if (strcmp(buf, argv[0]))
+ return false;
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tk->tp, argc, argv);
+}
+
static bool trace_kprobe_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct trace_kprobe *tk = to_trace_kprobe(ev);
return strcmp(trace_probe_name(&tk->tp), event) == 0 &&
- (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0);
+ (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) &&
+ trace_kprobe_match_command_head(tk, argc, argv);
}
static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
@@ -180,20 +204,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk)
return addr;
}
+static nokprobe_inline struct trace_kprobe *
+trace_kprobe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return NULL;
+
+ return container_of(tp, struct trace_kprobe, tp);
+}
+
bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+ struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return kprobe_on_func_entry(tk->rp.kp.addr,
+ return tk ? kprobe_on_func_entry(tk->rp.kp.addr,
tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
- tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false;
}
bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+ struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
- return within_error_injection_list(trace_kprobe_address(tk));
+ return tk ? within_error_injection_list(trace_kprobe_address(tk)) :
+ false;
}
static int register_kprobe_event(struct trace_kprobe *tk);
@@ -291,32 +328,68 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
return ret;
}
+static void __disable_trace_kprobe(struct trace_probe *tp)
+{
+ struct trace_probe *pos;
+ struct trace_kprobe *tk;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tk = container_of(pos, struct trace_kprobe, tp);
+ if (!trace_kprobe_is_registered(tk))
+ continue;
+ if (trace_kprobe_is_return(tk))
+ disable_kretprobe(&tk->rp);
+ else
+ disable_kprobe(&tk->rp.kp);
+ }
+}
+
/*
* Enable trace_probe
* if the file is NULL, enable "perf" handler, or enable "trace" handler.
*/
-static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
+static int enable_trace_kprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- bool enabled = trace_probe_is_enabled(&tk->tp);
+ struct trace_probe *pos, *tp;
+ struct trace_kprobe *tk;
+ bool enabled;
int ret = 0;
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
if (file) {
- ret = trace_probe_add_file(&tk->tp, file);
+ ret = trace_probe_add_file(tp, file);
if (ret)
return ret;
} else
- trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE);
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
if (enabled)
return 0;
- ret = __enable_trace_kprobe(tk);
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tk = container_of(pos, struct trace_kprobe, tp);
+ if (trace_kprobe_has_gone(tk))
+ continue;
+ ret = __enable_trace_kprobe(tk);
+ if (ret)
+ break;
+ enabled = true;
+ }
+
if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled)
+ __disable_trace_kprobe(tp);
if (file)
- trace_probe_remove_file(&tk->tp, file);
+ trace_probe_remove_file(tp, file);
else
- trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE);
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
}
return ret;
@@ -326,11 +399,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
* Disable trace_probe
* if the file is NULL, disable "perf" handler, or disable "trace" handler.
*/
-static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
+static int disable_trace_kprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- struct trace_probe *tp = &tk->tp;
- int ret = 0;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
if (file) {
if (!trace_probe_get_file_link(tp, file))
@@ -341,12 +417,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
} else
trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
- if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) {
- if (trace_kprobe_is_return(tk))
- disable_kretprobe(&tk->rp);
- else
- disable_kprobe(&tk->rp.kp);
- }
+ if (!trace_probe_is_enabled(tp))
+ __disable_trace_kprobe(tp);
out:
if (file)
@@ -358,7 +430,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
*/
trace_probe_remove_file(tp, file);
- return ret;
+ return 0;
}
#if defined(CONFIG_KPROBES_ON_FTRACE) && \
@@ -389,6 +461,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
{
int i, ret;
+ ret = security_locked_down(LOCKDOWN_KPROBES);
+ if (ret)
+ return ret;
+
if (trace_kprobe_is_registered(tk))
return -EINVAL;
@@ -437,6 +513,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk)
/* Unregister a trace_probe and probe_event */
static int unregister_trace_kprobe(struct trace_kprobe *tk)
{
+ /* If other probes are on the event, just unregister kprobe */
+ if (trace_probe_has_sibling(&tk->tp))
+ goto unreg;
+
/* Enabled event can not be unregistered */
if (trace_probe_is_enabled(&tk->tp))
return -EBUSY;
@@ -445,12 +525,82 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
if (unregister_kprobe_event(tk))
return -EBUSY;
+unreg:
__unregister_trace_kprobe(tk);
dyn_event_remove(&tk->devent);
+ trace_probe_unlink(&tk->tp);
return 0;
}
+static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig,
+ struct trace_kprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ struct trace_probe *pos;
+ int i;
+
+ list_for_each_entry(pos, &tpe->probes, list) {
+ orig = container_of(pos, struct trace_kprobe, tp);
+ if (strcmp(trace_kprobe_symbol(orig),
+ trace_kprobe_symbol(comp)) ||
+ trace_kprobe_offset(orig) != trace_kprobe_offset(comp))
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are the same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
+{
+ int ret;
+
+ ret = trace_probe_compare_arg_type(&tk->tp, &to->tp);
+ if (ret) {
+ /* Note that arguments start at index 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_kprobe_has_same_kprobe(to, tk)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tk->tp, &to->tp);
+ if (ret)
+ return ret;
+
+ /* Register k*probe */
+ ret = __register_trace_kprobe(tk);
+ if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) {
+ pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
+ ret = 0;
+ }
+
+ if (ret)
+ trace_probe_unlink(&tk->tp);
+ else
+ dyn_event_add(&tk->devent);
+
+ return ret;
+}
+
/* Register a trace_probe and probe_event */
static int register_trace_kprobe(struct trace_kprobe *tk)
{
@@ -459,14 +609,17 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&event_mutex);
- /* Delete old (same name) event if exist */
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
if (old_tk) {
- ret = unregister_trace_kprobe(old_tk);
- if (ret < 0)
- goto end;
- free_trace_kprobe(old_tk);
+ if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ ret = -EEXIST;
+ } else {
+ ret = append_trace_kprobe(tk, old_tk);
+ }
+ goto end;
}
/* Register new event */
@@ -700,7 +853,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_INSN_BNDRY);
else if (ret == -ENOENT)
trace_probe_log_err(0, BAD_PROBE_ADDR);
- else if (ret != -ENOMEM)
+ else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
goto error;
}
@@ -965,6 +1118,9 @@ retry:
case FETCH_OP_COMM:
val = (unsigned long)current->comm;
break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
@@ -1089,7 +1245,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
struct trace_probe *tp;
field = (struct kprobe_trace_entry_head *)iter->ent;
- tp = container_of(event, struct trace_probe, call.event);
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
@@ -1116,7 +1275,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
struct trace_probe *tp;
field = (struct kretprobe_trace_entry_head *)iter->ent;
- tp = container_of(event, struct trace_probe, call.event);
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
@@ -1145,23 +1307,31 @@ static int kprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret;
struct kprobe_trace_entry_head field;
- struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
- return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
static int kretprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret;
struct kretprobe_trace_entry_head field;
- struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
- return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp);
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
#ifdef CONFIG_PERF_EVENTS
@@ -1289,20 +1459,19 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
static int kprobe_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return enable_trace_kprobe(tk, file);
+ return enable_trace_kprobe(event, file);
case TRACE_REG_UNREGISTER:
- return disable_trace_kprobe(tk, file);
+ return disable_trace_kprobe(event, file);
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return enable_trace_kprobe(tk, NULL);
+ return enable_trace_kprobe(event, NULL);
case TRACE_REG_PERF_UNREGISTER:
- return disable_trace_kprobe(tk, NULL);
+ return disable_trace_kprobe(event, NULL);
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
case TRACE_REG_PERF_ADD:
@@ -1369,7 +1538,6 @@ static inline void init_trace_event_call(struct trace_kprobe *tk)
call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
- call->data = tk;
}
static int register_kprobe_event(struct trace_kprobe *tk)
@@ -1432,7 +1600,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
{
struct trace_kprobe *tk;
- tk = container_of(event_call, struct trace_kprobe, tp.call);
+ tk = trace_kprobe_primary_from_call(event_call);
+ if (unlikely(!tk))
+ return;
if (trace_probe_is_enabled(&tk->tp)) {
WARN_ON(1);
@@ -1577,7 +1747,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- enable_trace_kprobe(tk, file);
+ enable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
}
@@ -1598,7 +1769,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- enable_trace_kprobe(tk, file);
+ enable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
}
@@ -1631,7 +1803,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- disable_trace_kprobe(tk, file);
+ disable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
@@ -1649,7 +1822,8 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting probe file.\n");
warn++;
} else
- disable_trace_kprobe(tk, file);
+ disable_trace_kprobe(
+ trace_probe_event_call(&tk->tp), file);
}
ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
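
register_trace_kprobe() no longer deletes an existing event of the same name; a second definition is appended to that event when the probe type and argument names/types match, otherwise it fails with DIFF_PROBE_TYPE/DIFF_ARG_TYPE/SAME_PROBE. A small sketch that exercises the append path through tracefs; the tracefs path and the probed symbols below are only examples:

#include <stdio.h>
#include <stdlib.h>

static void define_probe(const char *def)
{
        const char *path = "/sys/kernel/debug/tracing/kprobe_events";
        FILE *f = fopen(path, "a");    /* append: no O_TRUNC, keep other events */

        if (!f) {
                perror(path);
                exit(1);
        }
        if (fprintf(f, "%s\n", def) < 0)
                perror("write");
        fclose(f);
}

int main(void)
{
        /* First definition creates the event ... */
        define_probe("p:mygroup/myfork _do_fork pid=$arg1");
        /* ... the second now appends another probe to the same event,
         * provided the argument names and types match. */
        define_probe("p:mygroup/myfork fork_idle pid=$arg1");
        return 0;
}

The argc/argv matching added to dyn_event_release() above is what then lets an individual probe of such a multiprobe event be targeted for removal by also spelling out its probe point and arguments.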
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cab4a5398f1d..d54ce252b05a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
+ const char *fmt = concatenate ? "%*phN" : "%*ph";
- for (i = 0; i < buf_len; i++)
- trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
- buf[i]);
+ for (i = 0; i < buf_len; i += 16)
+ trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);
trace_seq_putc(p, 0);
return ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index fb6bfbc5bf86..baf58a3612c0 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -316,6 +316,29 @@ inval_var:
return -EINVAL;
}
+static int str_to_immediate(char *str, unsigned long *imm)
+{
+ if (isdigit(str[0]))
+ return kstrtoul(str, 0, imm);
+ else if (str[0] == '-')
+ return kstrtol(str, 0, (long *)imm);
+ else if (str[0] == '+')
+ return kstrtol(str + 1, 0, (long *)imm);
+ return -EINVAL;
+}
+
+static int __parse_imm_string(char *str, char **pbuf, int offs)
+{
+ size_t len = strlen(str);
+
+ if (str[len - 1] != '"') {
+ trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
+ return -EINVAL;
+ }
+ *pbuf = kstrndup(str, len - 1, GFP_KERNEL);
+ return 0;
+}
+
/* Recursive argument parser */
static int
parse_probe_arg(char *arg, const struct fetch_type *type,
@@ -430,7 +453,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
if (ret)
break;
- if (code->op == FETCH_OP_COMM) {
+ if (code->op == FETCH_OP_COMM ||
+ code->op == FETCH_OP_DATA) {
trace_probe_log_err(offs, COMM_CANT_DEREF);
return -EINVAL;
}
@@ -444,6 +468,21 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->offset = offset;
}
break;
+ case '\\': /* Immediate value */
+ if (arg[1] == '"') { /* Immediate string */
+ ret = __parse_imm_string(arg + 2, &tmp, offs + 2);
+ if (ret)
+ break;
+ code->op = FETCH_OP_DATA;
+ code->data = tmp;
+ } else {
+ ret = str_to_immediate(arg + 1, &code->immediate);
+ if (ret)
+ trace_probe_log_err(offs + 1, BAD_IMM);
+ else
+ code->op = FETCH_OP_IMM;
+ }
+ break;
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
@@ -542,8 +581,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
}
}
- /* Since $comm can not be dereferred, we can find $comm by strcmp */
- if (strcmp(arg, "$comm") == 0) {
+ /*
+ * Since $comm and an immediate string cannot be dereferenced,
+ * we can find those by strcmp.
+ */
+ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
return -EINVAL;
@@ -580,7 +622,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (!strcmp(parg->type->name, "string") ||
!strcmp(parg->type->name, "ustring")) {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
- code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) {
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+ code->op != FETCH_OP_DATA) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
ret = -EINVAL;
@@ -589,9 +632,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||
parg->count) {
/*
- * IMM and COMM is pointing actual address, those must
- * be kept, and if parg->count != 0, this is an array
- * of string pointers instead of string address itself.
+ * IMM, DATA and COMM point to an actual address; those
+ * must be kept, and if parg->count != 0, this is an
+ * array of string pointers instead of the string address
+ * itself.
*/
code++;
if (code->op != FETCH_OP_NOP) {
@@ -665,7 +709,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
fail:
if (ret) {
for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
- if (code->op == FETCH_NOP_SYMBOL)
+ if (code->op == FETCH_NOP_SYMBOL ||
+ code->op == FETCH_OP_DATA)
kfree(code->data);
}
kfree(tmp);
@@ -736,7 +781,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
struct fetch_insn *code = arg->code;
while (code && code->op != FETCH_OP_END) {
- if (code->op == FETCH_NOP_SYMBOL)
+ if (code->op == FETCH_NOP_SYMBOL ||
+ code->op == FETCH_OP_DATA)
kfree(code->data);
code++;
}
@@ -886,44 +932,85 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call,
return 0;
}
+static void trace_probe_event_free(struct trace_probe_event *tpe)
+{
+ kfree(tpe->class.system);
+ kfree(tpe->call.name);
+ kfree(tpe->call.print_fmt);
+ kfree(tpe);
+}
+
+int trace_probe_append(struct trace_probe *tp, struct trace_probe *to)
+{
+ if (trace_probe_has_sibling(tp))
+ return -EBUSY;
+
+ list_del_init(&tp->list);
+ trace_probe_event_free(tp->event);
+
+ tp->event = to->event;
+ list_add_tail(&tp->list, trace_probe_probe_list(to));
+
+ return 0;
+}
+
+void trace_probe_unlink(struct trace_probe *tp)
+{
+ list_del_init(&tp->list);
+ if (list_empty(trace_probe_probe_list(tp)))
+ trace_probe_event_free(tp->event);
+ tp->event = NULL;
+}
void trace_probe_cleanup(struct trace_probe *tp)
{
- struct trace_event_call *call = trace_probe_event_call(tp);
int i;
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
- if (call->class)
- kfree(call->class->system);
- kfree(call->name);
- kfree(call->print_fmt);
+ if (tp->event)
+ trace_probe_unlink(tp);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
const char *group)
{
- struct trace_event_call *call = trace_probe_event_call(tp);
+ struct trace_event_call *call;
+ int ret = 0;
if (!event || !group)
return -EINVAL;
- call->class = &tp->class;
- call->name = kstrdup(event, GFP_KERNEL);
- if (!call->name)
+ tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL);
+ if (!tp->event)
return -ENOMEM;
- tp->class.system = kstrdup(group, GFP_KERNEL);
- if (!tp->class.system) {
- kfree(call->name);
- call->name = NULL;
- return -ENOMEM;
+ INIT_LIST_HEAD(&tp->event->files);
+ INIT_LIST_HEAD(&tp->event->class.fields);
+ INIT_LIST_HEAD(&tp->event->probes);
+ INIT_LIST_HEAD(&tp->list);
+ list_add(&tp->event->probes, &tp->list);
+
+ call = trace_probe_event_call(tp);
+ call->class = &tp->event->class;
+ call->name = kstrdup(event, GFP_KERNEL);
+ if (!call->name) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ tp->event->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->event->class.system) {
+ ret = -ENOMEM;
+ goto error;
}
- INIT_LIST_HEAD(&tp->files);
- INIT_LIST_HEAD(&tp->class.fields);
return 0;
+
+error:
+ trace_probe_cleanup(tp);
+ return ret;
}
int trace_probe_register_event_call(struct trace_probe *tp)
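After this change a trace_probe no longer owns its event call directly: trace_probe_init() allocates a trace_probe_event, trace_probe_append() makes a probe join another probe's event, and trace_probe_unlink()/trace_probe_cleanup() free the shared event only when the last probe leaves it. A condensed sketch of that lifecycle, purely as an illustration of the API added here, with locking, argument parsing and the surrounding trace_kprobe/trace_uprobe allocation left out:

    /* Sketch only: assumes tp_a and tp_b are embedded in two freshly
     * allocated probes and that event_mutex is held by the caller.
     */
    #include "trace_probe.h"

    static int demo_share_event(struct trace_probe *tp_a, struct trace_probe *tp_b)
    {
            int ret;

            ret = trace_probe_init(tp_a, "demo_event", "demo_group");
            if (ret)
                    return ret;
            ret = trace_probe_init(tp_b, "demo_event", "demo_group");
            if (ret)
                    goto err_a;

            /* tp_b frees its private trace_probe_event and joins tp_a's */
            ret = trace_probe_append(tp_b, tp_a);
            if (ret)
                    goto err_b;

            /* ... use the probes; the last one to unlink frees the event */
            trace_probe_unlink(tp_b);
            trace_probe_unlink(tp_a);
            return 0;

    err_b:
            trace_probe_cleanup(tp_b);
    err_a:
            trace_probe_cleanup(tp_a);
            return ret;
    }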
@@ -952,7 +1039,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file)
link->file = file;
INIT_LIST_HEAD(&link->list);
- list_add_tail_rcu(&link->list, &tp->files);
+ list_add_tail_rcu(&link->list, &tp->event->files);
trace_probe_set_flag(tp, TP_FLAG_TRACE);
return 0;
}
@@ -983,8 +1070,45 @@ int trace_probe_remove_file(struct trace_probe *tp,
synchronize_rcu();
kfree(link);
- if (list_empty(&tp->files))
+ if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
return 0;
}
+
+/*
+ * Return the index (starting from 1) of the first argument whose type or
+ * name differs. If all argument types and names are the same, return 0.
+ */
+int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b)
+{
+ int i;
+
+ for (i = 0; i < a->nr_args; i++) {
+ if ((b->nr_args <= i) ||
+ ((a->args[i].type != b->args[i].type) ||
+ (a->args[i].count != b->args[i].count) ||
+ strcmp(a->args[i].name, b->args[i].name)))
+ return i + 1;
+ }
+
+ return 0;
+}
+
+bool trace_probe_match_command_args(struct trace_probe *tp,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+ int i;
+
+ if (tp->nr_args < argc)
+ return false;
+
+ for (i = 0; i < argc; i++) {
+ snprintf(buf, sizeof(buf), "%s=%s",
+ tp->args[i].name, tp->args[i].comm);
+ if (strcmp(buf, argv[i]))
+ return false;
+ }
+ return true;
+}
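trace_probe_compare_arg_type() and trace_probe_match_command_args() back the argc/argv form of dyn_event matching introduced in this series: when several probes share one event, a removal command can spell out the probe location and arguments so that only the matching sibling is deleted. As a rough illustration of issuing such a command through tracefs (the path, group/event name, target binary and offset are assumptions; note that the uprobe offset is compared against the zero-padded form the kernel prints):

    /* Sketch only: path, event name, binary and offset are assumptions. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/tracing/dynamic_events", "a");

            if (!f)
                    return 1;
            /* Remove only the sibling whose location and argument list match */
            fprintf(f, "-:myprobes/demo /bin/sh:0x0000000000004710 arg1=%%ax\n");
            fclose(f);
            return 0;
    }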
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index d1714820efe1..4ee703728aec 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -89,6 +89,7 @@ enum fetch_op {
FETCH_OP_COMM, /* Current comm */
FETCH_OP_ARG, /* Function argument : .param */
FETCH_OP_FOFFS, /* File offset: .immediate */
+ FETCH_OP_DATA, /* Allocated data: .data */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
@@ -222,11 +223,18 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
-struct trace_probe {
+/* Event call and class holder */
+struct trace_probe_event {
unsigned int flags; /* For TP_FLAG_* */
struct trace_event_class class;
struct trace_event_call call;
struct list_head files;
+ struct list_head probes;
+};
+
+struct trace_probe {
+ struct list_head list;
+ struct trace_probe_event *event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
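With this split, the event-wide state (call, class, flags and the enabled file list) lives once in trace_probe_event, while each trace_probe is little more than a list node plus its argument array, embedded in a trace_kprobe or trace_uprobe. Informally, two probes appended to one event look roughly like this (an illustration, not part of the header):

    trace_probe_event  (shared, one per event/group name)
        .flags  .class  .call  .files
        .probes ──┬── trace_probe in trace_uprobe A   (.event points back)
                  └── trace_probe in trace_uprobe B   (.event points back)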
@@ -240,19 +248,19 @@ struct event_file_link {
static inline bool trace_probe_test_flag(struct trace_probe *tp,
unsigned int flag)
{
- return !!(tp->flags & flag);
+ return !!(tp->event->flags & flag);
}
static inline void trace_probe_set_flag(struct trace_probe *tp,
unsigned int flag)
{
- tp->flags |= flag;
+ tp->event->flags |= flag;
}
static inline void trace_probe_clear_flag(struct trace_probe *tp,
unsigned int flag)
{
- tp->flags &= ~flag;
+ tp->event->flags &= ~flag;
}
static inline bool trace_probe_is_enabled(struct trace_probe *tp)
@@ -262,45 +270,76 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp)
static inline const char *trace_probe_name(struct trace_probe *tp)
{
- return trace_event_name(&tp->call);
+ return trace_event_name(&tp->event->call);
}
static inline const char *trace_probe_group_name(struct trace_probe *tp)
{
- return tp->call.class->system;
+ return tp->event->call.class->system;
}
static inline struct trace_event_call *
trace_probe_event_call(struct trace_probe *tp)
{
- return &tp->call;
+ return &tp->event->call;
+}
+
+static inline struct trace_probe_event *
+trace_probe_event_from_call(struct trace_event_call *event_call)
+{
+ return container_of(event_call, struct trace_probe_event, call);
+}
+
+static inline struct trace_probe *
+trace_probe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe_event *tpe = trace_probe_event_from_call(call);
+
+ return list_first_entry(&tpe->probes, struct trace_probe, list);
+}
+
+static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp)
+{
+ return &tp->event->probes;
+}
+
+static inline bool trace_probe_has_sibling(struct trace_probe *tp)
+{
+ struct list_head *list = trace_probe_probe_list(tp);
+
+ return !list_empty(list) && !list_is_singular(list);
}
static inline int trace_probe_unregister_event_call(struct trace_probe *tp)
{
/* tp->event is unregistered in trace_remove_event_call() */
- return trace_remove_event_call(&tp->call);
+ return trace_remove_event_call(&tp->event->call);
}
static inline bool trace_probe_has_single_file(struct trace_probe *tp)
{
- return !!list_is_singular(&tp->files);
+ return !!list_is_singular(&tp->event->files);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
const char *group);
void trace_probe_cleanup(struct trace_probe *tp);
+int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
+void trace_probe_unlink(struct trace_probe *tp);
int trace_probe_register_event_call(struct trace_probe *tp);
int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file);
int trace_probe_remove_file(struct trace_probe *tp,
struct trace_event_file *file);
struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,
struct trace_event_file *file);
+int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
+bool trace_probe_match_command_args(struct trace_probe *tp,
+ int argc, const char **argv);
#define trace_probe_for_each_link(pos, tp) \
- list_for_each_entry(pos, &(tp)->files, list)
+ list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
- list_for_each_entry_rcu(pos, &(tp)->files, list)
+ list_for_each_entry_rcu(pos, &(tp)->event->files, list)
/* Check the name is good for event/group/fields */
static inline bool is_good_name(const char *name)
@@ -370,6 +409,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_VAR, "Invalid $-variable specified"), \
C(BAD_REG_NAME, "Invalid register name"), \
C(BAD_MEM_ADDR, "Invalid memory address"), \
+ C(BAD_IMM, "Invalid immediate value"), \
+ C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \
C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \
C(BAD_FILE_OFFS, "Invalid file offset value"), \
C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \
@@ -393,7 +434,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(ARG_TOO_LONG, "Argument expression is too long"), \
C(NO_ARG_BODY, "No argument expression"), \
C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\
- C(FAIL_REG_PROBE, "Failed to register probe event"),
+ C(FAIL_REG_PROBE, "Failed to register probe event"),\
+ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\
+ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\
+ C(SAME_PROBE, "There is already the exact same probe event"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5d16f73898db..ec9a34a97129 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -53,6 +53,104 @@ static void print_max_stack(void)
}
}
+/*
+ * The stack tracer looks for the maximum stack usage at each function call. It
+ * registers a callback with ftrace, and in that callback it examines the stack
+ * size. It determines the stack size from the variable passed in, which is the
+ * address of a local variable in the stack_trace_call() callback function.
+ * The stack size is the distance from the address of that local variable to the
+ * top of the current stack. If that size is smaller than the currently saved max
+ * stack size, nothing more is done.
+ *
+ * If the size of the stack is greater than the maximum recorded size, then the
+ * following algorithm takes place.
+ *
+ * For architectures (like x86) that store the function's return address before
+ * saving the function's local variables, the stack will look something like
+ * this:
+ *
+ * [ top of stack ]
+ * 0: sys call entry frame
+ * 10: return addr to entry code
+ * 11: start of sys_foo frame
+ * 20: return addr to sys_foo
+ * 21: start of kernel_func_bar frame
+ * 30: return addr to kernel_func_bar
+ * 31: [ do trace stack here ]
+ *
+ * save_stack_trace() is then called, returning all the functions it finds in
+ * the current stack, which would be (from the bottom of the stack to the top):
+ *
+ * return addr to kernel_func_bar
+ * return addr to sys_foo
+ * return addr to entry code
+ *
+ * Now, to figure out how large each of these functions' local variables are,
+ * a search of the stack is made to find these values. When a match is made, it
+ * is added to the stack_dump_trace[] array. The offset into the stack is saved
+ * in the stack_trace_index[] array. The above example would show:
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 30
+ * return addr to sys_foo | 20
+ * return addr to entry | 10
+ *
+ * The print_max_stack() function above uses these values to print the size of
+ * each function's portion of the stack.
+ *
+ * for (i = 0; i < nr_entries; i++) {
+ * size = i == nr_entries - 1 ? stack_trace_index[i] :
+ * stack_trace_index[i] - stack_trace_index[i+1]
+ * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]);
+ * }
+ *
+ * The above shows
+ *
+ * depth size location
+ * ----- ---- --------
+ * 0 30 10 kernel_func_bar
+ * 1 20 10 sys_foo
+ * 2 10 10 entry code
+ *
+ * Now for architectures that might save the return address after the function's
+ * local variables (saving the link register before calling nested functions),
+ * this will cause the stack to look a little different:
+ *
+ * [ top of stack ]
+ * 0: sys call entry frame
+ * 10: start of sys_foo_frame
+ * 19: return addr to entry code << lr saved before calling kernel_func_bar
+ * 20: start of kernel_func_bar frame
+ * 29: return addr to sys_foo_frame << lr saved before calling next function
+ * 30: [ do trace stack here ]
+ *
+ * Although the functions returned by save_stack_trace() may be the same, the
+ * placement in the stack will be different. Using the same algorithm as above
+ * would yield:
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 30
+ * return addr to sys_foo | 29
+ * return addr to entry | 19
+ *
+ * Where the mapping is off by one:
+ *
+ * kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
+ *
+ * To fix this, if the architecture defines ARCH_FTRACE_SHIFT_STACK_TRACER, the
+ * values in stack_trace_index[] are shifted by one and the number of
+ * stack trace entries is decremented by one.
+ *
+ * stack_dump_trace[] | stack_trace_index[]
+ * ------------------ + -------------------
+ * return addr to kernel_func_bar | 29
+ * return addr to sys_foo | 19
+ *
+ * Although the entry function is not displayed, the first function (sys_foo)
+ * will still include its stack size.
+ */
static void check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
@@ -158,6 +256,20 @@ static void check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
+ /*
+ * Some archs will store the link register before calling
+ * nested functions. This means the saved return address
+ * comes after the local storage, and we need to shift
+ * for that.
+ */
+ if (x > 1) {
+ memmove(&stack_trace_index[0], &stack_trace_index[1],
+ sizeof(stack_trace_index[0]) * (x - 1));
+ x--;
+ }
+#endif
+
stack_trace_nr_entries = x;
if (task_stack_end_corrupted(current)) {
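The depth/size/location table described in the comment above is what the stack_trace file prints once the stack tracer is running. A small user-space sketch for turning it on and dumping that table (the sysctl and tracefs paths are assumptions about where tracefs is mounted on the running system):

    /* Sketch only: the sysctl and tracefs paths are assumptions. */
    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *en = fopen("/proc/sys/kernel/stack_tracer_enabled", "w");
            FILE *out;

            if (!en)
                    return 1;
            fputs("1\n", en);               /* start recording the max stack */
            fclose(en);

            out = fopen("/sys/kernel/debug/tracing/stack_trace", "r");
            if (!out)
                    return 1;
            while (fgets(line, sizeof(line), out))
                    fputs(line, stdout);    /* the depth/size/location table */
            fclose(out);
            return 0;
    }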
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 1ceedb9146b1..dd884341f5c5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -44,7 +44,7 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
static bool trace_uprobe_is_busy(struct dyn_event *ev);
static bool trace_uprobe_match(const char *system, const char *event,
- struct dyn_event *ev);
+ int argc, const char **argv, struct dyn_event *ev);
static struct dyn_event_operations trace_uprobe_ops = {
.create = trace_uprobe_create,
@@ -248,6 +248,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
case FETCH_OP_COMM:
val = FETCH_TOKEN_COMM;
break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
@@ -284,13 +287,54 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev)
return trace_probe_is_enabled(&tu->tp);
}
+static bool trace_uprobe_match_command_head(struct trace_uprobe *tu,
+ int argc, const char **argv)
+{
+ char buf[MAX_ARGSTR_LEN + 1];
+ int len;
+
+ if (!argc)
+ return true;
+
+ len = strlen(tu->filename);
+ if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':')
+ return false;
+
+ if (tu->ref_ctr_offset == 0)
+ snprintf(buf, sizeof(buf), "0x%0*lx",
+ (int)(sizeof(void *) * 2), tu->offset);
+ else
+ snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)",
+ (int)(sizeof(void *) * 2), tu->offset,
+ tu->ref_ctr_offset);
+ if (strcmp(buf, &argv[0][len + 1]))
+ return false;
+
+ argc--; argv++;
+
+ return trace_probe_match_command_args(&tu->tp, argc, argv);
+}
+
static bool trace_uprobe_match(const char *system, const char *event,
- struct dyn_event *ev)
+ int argc, const char **argv, struct dyn_event *ev)
{
struct trace_uprobe *tu = to_trace_uprobe(ev);
return strcmp(trace_probe_name(&tu->tp), event) == 0 &&
- (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0);
+ (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) &&
+ trace_uprobe_match_command_head(tu, argc, argv);
+}
+
+static nokprobe_inline struct trace_uprobe *
+trace_uprobe_primary_from_call(struct trace_event_call *call)
+{
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return NULL;
+
+ return container_of(tp, struct trace_uprobe, tp);
}
/*
@@ -352,15 +396,76 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
int ret;
+ if (trace_probe_has_sibling(&tu->tp))
+ goto unreg;
+
ret = unregister_uprobe_event(tu);
if (ret)
return ret;
+unreg:
dyn_event_remove(&tu->devent);
+ trace_probe_unlink(&tu->tp);
free_trace_uprobe(tu);
return 0;
}
+static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig,
+ struct trace_uprobe *comp)
+{
+ struct trace_probe_event *tpe = orig->tp.event;
+ struct trace_probe *pos;
+ struct inode *comp_inode = d_real_inode(comp->path.dentry);
+ int i;
+
+ list_for_each_entry(pos, &tpe->probes, list) {
+ orig = container_of(pos, struct trace_uprobe, tp);
+ if (comp_inode != d_real_inode(orig->path.dentry) ||
+ comp->offset != orig->offset)
+ continue;
+
+ /*
+ * trace_probe_compare_arg_type() ensured that nr_args and
+ * each argument name and type are the same. Let's compare comm.
+ */
+ for (i = 0; i < orig->tp.nr_args; i++) {
+ if (strcmp(orig->tp.args[i].comm,
+ comp->tp.args[i].comm))
+ break;
+ }
+
+ if (i == orig->tp.nr_args)
+ return true;
+ }
+
+ return false;
+}
+
+static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
+{
+ int ret;
+
+ ret = trace_probe_compare_arg_type(&tu->tp, &to->tp);
+ if (ret) {
+ /* Note that arguments start at index 2 */
+ trace_probe_log_set_index(ret + 1);
+ trace_probe_log_err(0, DIFF_ARG_TYPE);
+ return -EEXIST;
+ }
+ if (trace_uprobe_has_same_uprobe(to, tu)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, SAME_PROBE);
+ return -EEXIST;
+ }
+
+ /* Append to existing event */
+ ret = trace_probe_append(&tu->tp, &to->tp);
+ if (!ret)
+ dyn_event_add(&tu->devent);
+
+ return ret;
+}
+
/*
* Uprobe with multiple reference counter is not allowed. i.e.
* If inode and offset matches, reference counter offset *must*
@@ -370,25 +475,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
* as the new one does not conflict with any other existing
* ones.
*/
-static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new)
+static int validate_ref_ctr_offset(struct trace_uprobe *new)
{
struct dyn_event *pos;
- struct trace_uprobe *tmp, *old = NULL;
+ struct trace_uprobe *tmp;
struct inode *new_inode = d_real_inode(new->path.dentry);
- old = find_probe_event(trace_probe_name(&new->tp),
- trace_probe_group_name(&new->tp));
-
for_each_trace_uprobe(tmp, pos) {
- if ((old ? old != tmp : true) &&
- new_inode == d_real_inode(tmp->path.dentry) &&
+ if (new_inode == d_real_inode(tmp->path.dentry) &&
new->offset == tmp->offset &&
new->ref_ctr_offset != tmp->ref_ctr_offset) {
pr_warn("Reference counter offset mismatch.");
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
}
- return old;
+ return 0;
}
/* Register a trace_uprobe and probe_event */
@@ -399,18 +500,22 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&event_mutex);
- /* register as an event */
- old_tu = find_old_trace_uprobe(tu);
- if (IS_ERR(old_tu)) {
- ret = PTR_ERR(old_tu);
+ ret = validate_ref_ctr_offset(tu);
+ if (ret)
goto end;
- }
+ /* register as an event */
+ old_tu = find_probe_event(trace_probe_name(&tu->tp),
+ trace_probe_group_name(&tu->tp));
if (old_tu) {
- /* delete old event */
- ret = unregister_trace_uprobe(old_tu);
- if (ret)
- goto end;
+ if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, DIFF_PROBE_TYPE);
+ ret = -EEXIST;
+ } else {
+ ret = append_trace_uprobe(tu, old_tu);
+ }
+ goto end;
}
ret = register_uprobe_event(tu);
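Registering a second uprobe under an existing event name therefore no longer replaces the old probe: if the probe type and argument signature match, the new probe is appended as a sibling of the existing event, otherwise the command fails with DIFF_PROBE_TYPE or DIFF_ARG_TYPE. A rough user-space illustration of building such a multi-probe event (the target binary, offsets and event name are assumptions, not taken from this patch):

    /* Sketch only: the target binary, offsets and event name are assumptions. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/tracing/uprobe_events", "a");

            if (!f)
                    return 1;
            /* The first definition creates the event ... */
            fprintf(f, "p:myprobes/demo /bin/sh:0x4710 arg1=%%ax\n");
            /* ... the second, with the same argument signature, is appended */
            fprintf(f, "p:myprobes/demo /bin/sh:0x4840 arg1=%%ax\n");
            fclose(f);
            return 0;
    }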
@@ -897,7 +1002,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
u8 *data;
entry = (struct uprobe_trace_entry_head *)iter->ent;
- tu = container_of(event, struct trace_uprobe, tp.call.event);
+ tu = trace_uprobe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (unlikely(!tu))
+ goto out;
if (is_ret_probe(tu)) {
trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
@@ -924,27 +1032,71 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
-static int
-probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
- filter_func_t filter)
+static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- bool enabled = trace_probe_is_enabled(&tu->tp);
int ret;
+ tu->consumer.filter = filter;
+ tu->inode = d_real_inode(tu->path.dentry);
+
+ if (tu->ref_ctr_offset)
+ ret = uprobe_register_refctr(tu->inode, tu->offset,
+ tu->ref_ctr_offset, &tu->consumer);
+ else
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+
+ if (ret)
+ tu->inode = NULL;
+
+ return ret;
+}
+
+static void __probe_event_disable(struct trace_probe *tp)
+{
+ struct trace_probe *pos;
+ struct trace_uprobe *tu;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ if (!tu->inode)
+ continue;
+
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->inode = NULL;
+ }
+}
+
+static int probe_event_enable(struct trace_event_call *call,
+ struct trace_event_file *file, filter_func_t filter)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ bool enabled;
+ int ret;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This may also change "enabled" state */
if (file) {
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
+ if (trace_probe_test_flag(tp, TP_FLAG_PROFILE))
return -EINTR;
- ret = trace_probe_add_file(&tu->tp, file);
+ ret = trace_probe_add_file(tp, file);
if (ret < 0)
return ret;
} else {
- if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
+ if (trace_probe_test_flag(tp, TP_FLAG_TRACE))
return -EINTR;
- trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE);
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
}
+ tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
if (enabled)
@@ -954,18 +1106,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
if (ret)
goto err_flags;
- tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
- if (tu->ref_ctr_offset) {
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- } else {
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ ret = trace_uprobe_enable(tu, filter);
+ if (ret) {
+ __probe_event_disable(tp);
+ goto err_buffer;
+ }
}
- if (ret)
- goto err_buffer;
-
return 0;
err_buffer:
@@ -973,33 +1122,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
err_flags:
if (file)
- trace_probe_remove_file(&tu->tp, file);
+ trace_probe_remove_file(tp, file);
else
- trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
return ret;
}
-static void
-probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
+static void probe_event_disable(struct trace_event_call *call,
+ struct trace_event_file *file)
{
- if (!trace_probe_is_enabled(&tu->tp))
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return;
+
+ if (!trace_probe_is_enabled(tp))
return;
if (file) {
- if (trace_probe_remove_file(&tu->tp, file) < 0)
+ if (trace_probe_remove_file(tp, file) < 0)
return;
- if (trace_probe_is_enabled(&tu->tp))
+ if (trace_probe_is_enabled(tp))
return;
} else
- trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE);
-
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
-
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ __probe_event_disable(tp);
uprobe_buffer_disable();
}
@@ -1007,7 +1158,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, size;
struct uprobe_trace_entry_head field;
- struct trace_uprobe *tu = event_call->data;
+ struct trace_uprobe *tu;
+
+ tu = trace_uprobe_primary_from_call(event_call);
+ if (unlikely(!tu))
+ return -ENODEV;
if (is_ret_probe(tu)) {
DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
@@ -1100,6 +1255,27 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
return err;
}
+static int uprobe_perf_multi_call(struct trace_event_call *call,
+ struct perf_event *event,
+ int (*op)(struct trace_uprobe *tu, struct perf_event *event))
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
+ ret = op(tu, event);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
@@ -1213,30 +1389,29 @@ static int
trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
void *data)
{
- struct trace_uprobe *tu = event->data;
struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, file, NULL);
+ return probe_event_enable(event, file, NULL);
case TRACE_REG_UNREGISTER:
- probe_event_disable(tu, file);
+ probe_event_disable(event, file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, NULL, uprobe_perf_filter);
+ return probe_event_enable(event, NULL, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
- probe_event_disable(tu, NULL);
+ probe_event_disable(event, NULL);
return 0;
case TRACE_REG_PERF_OPEN:
- return uprobe_perf_open(tu, data);
+ return uprobe_perf_multi_call(event, data, uprobe_perf_open);
case TRACE_REG_PERF_CLOSE:
- return uprobe_perf_close(tu, data);
+ return uprobe_perf_multi_call(event, data, uprobe_perf_close);
#endif
default:
@@ -1330,7 +1505,6 @@ static inline void init_trace_event_call(struct trace_uprobe *tu)
call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
call->class->reg = trace_uprobe_register;
- call->data = tu;
}
static int register_uprobe_event(struct trace_uprobe *tu)
@@ -1399,7 +1573,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
{
struct trace_uprobe *tu;
- tu = container_of(event_call, struct trace_uprobe, tp.call);
+ tu = trace_uprobe_primary_from_call(event_call);
free_trace_uprobe(tu);
}