diff options
Diffstat (limited to 'kernel')
50 files changed, 977 insertions, 565 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 1f37f15117e5..72ab759a0b43 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb) * This function doesn't consume an skb as might be expected since it has to * copy it anyways. */ -static void kauditd_send_multicast_skb(struct sk_buff *skb) +static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *copy; struct audit_net *aunet = net_generic(&init_net, audit_net_id); @@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, GFP_KERNEL); + copy = skb_copy(skb, gfp_mask); if (!copy) return; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); } /* @@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); - s.version = AUDIT_VERSION_LATEST; + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; @@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb) } /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(int group) +static int audit_bind(struct net *net, int group) { if (!capable(CAP_AUDIT_READ)) return -EPERM; @@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab) struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb); + kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); /* * The original kaudit unicast socket sends up messages with diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e015570..2e0c97427b33 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) struct fsnotify_mark *entry = &chunk->mark; struct list_head *list; - if (!entry->i.inode) + if (!entry->inode) return; - list = chunk_hash(entry->i.inode); + list = chunk_hash(entry->inode); list_add_rcu(&chunk->hash, list); } @@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { + if (p->mark.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { + if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); if (new) free_chunk(new); @@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) goto Fallback; fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { + if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); fsnotify_put_mark(old_entry); @@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -611,7 +611,7 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.i.inode; + struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3598e13f2a65..4f68a326d92e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; - } - - if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { - struct pid *pid; - rcu_read_lock(); - pid = find_vpid(f->val); - if (!pid) { - rcu_read_unlock(); - err = -ESRCH; - goto exit_free; - } - f->val = pid_nr(pid); - rcu_read_unlock(); + entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } err = audit_field_valid(entry, f); @@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_LOGINUID_SET: + if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { + data->fields[i] = AUDIT_LOGINUID; + data->values[i] = AUDIT_UID_UNSET; + break; + } + /* fallthrough if set */ default: data->values[i] = f->val; } @@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) int i; if (a->flags != b->flags || + a->pflags != b->pflags || a->listnr != b->listnr || a->action != b->action || a->field_count != b->field_count) @@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) new = &entry->rule; new->vers_ops = old->vers_ops; new->flags = old->flags; + new->pflags = old->pflags; new->listnr = old->listnr; new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c75522a83678..072566dd0caf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -72,6 +72,8 @@ #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> +#include <linux/string.h> +#include <uapi/linux/limits.h> #include "audit.h" @@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - /* does the name pointer match? */ - if (!n->name || n->name->name != name->name) + if (!n->name || strcmp(n->name->name, name->name)) continue; /* match the correct record type */ @@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } out_alloc: - /* unable to find the name from a previous getname(). Allocate a new - * anonymous entry. - */ - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); + /* unable to find an entry with both a matching name and type */ + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; + /* unfortunately, while we may have a path name to record with the + * inode, we can't always rely on the string lasting until the end of + * the syscall so we need to create our own copy, it may fail due to + * memory allocation issues, but we do our best */ + if (name) { + /* we can't use getname_kernel() due to size limits */ + size_t len = strlen(name->name) + 1; + struct filename *new = __getname(); + + if (unlikely(!new)) + goto out; + + if (len <= (PATH_MAX - sizeof(*new))) { + new->name = (char *)(new) + sizeof(*new); + new->separate = false; + } else if (len <= PATH_MAX) { + /* this looks odd, but is due to final_putname() */ + struct filename *new2; + + new2 = kmalloc(sizeof(*new2), GFP_KERNEL); + if (unlikely(!new2)) { + __putname(new); + goto out; + } + new2->name = (char *)new; + new2->separate = true; + new = new2; + } else { + /* we should never get here, but let's be safe */ + __putname(new); + goto out; + } + strlcpy((char *)new->name, name->name, len); + new->uptr = NULL; + new->aname = n; + n->name = new; + n->name_put = true; + } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -27,6 +27,9 @@ * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ + +#define pr_fmt(fmt) "KGDB: " fmt + #include <linux/pid_namespace.h> #include <linux/clocksource.h> #include <linux/serial_core.h> @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) return err; err = kgdb_arch_remove_breakpoint(&tmp); if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); + pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", + addr); return err; } @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", - kgdb_break[i].bpt_addr); + pr_info("BP install failed: %lx\n", + kgdb_break[i].bpt_addr); continue; } @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) continue; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", - kgdb_break[i].bpt_addr); + pr_info("BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) goto setundefined; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + pr_err("breakpoint remove failed: %lx\n", kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) if (print_wait) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); + pr_crit("waiting... or $3#33 for KDB\n"); #else - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + pr_crit("Waiting for remote debugger\n"); #endif } return 1; @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); dbg_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); + pr_crit("re-enter error: breakpoint removed %lx\n", addr); WARN_ON_ONCE(1); return 1; @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) panic("Recursive entry to debugger"); } - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + pr_crit("re-enter exception: ALL breakpoints killed\n"); #ifdef CONFIG_KGDB_KDB /* Allow kdb to debug itself one level */ return 0; @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int cpu; int trace_on = 0; int online_cpus = num_online_cpus(); + u64 time_left; kgdb_info[ks->cpu].enter_kgdb++; kgdb_info[ks->cpu].exception_state |= exception_state; @@ -595,9 +598,13 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + - atomic_read(&slaves_in_kgdb)) != online_cpus) + time_left = loops_per_jiffy * HZ; + while (kgdb_do_roundup && --time_left && + (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != + online_cpus) cpu_relax(); + if (!time_left) + pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -795,15 +802,15 @@ static struct console kgdbcons = { static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + pr_crit("ERROR: No KGDB I/O module available\n"); return; } if (!kgdb_connected) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); + pr_crit("KGDB or $3#33 for KDB\n"); #else - printk(KERN_CRIT "Entering KGDB\n"); + pr_crit("Entering KGDB\n"); #endif } @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) { kgdb_break_asap = 0; - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + pr_crit("Waiting for connection from remote gdb...\n"); kgdb_breakpoint(); } @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) if (dbg_io_ops) { spin_unlock(&kgdb_registration_lock); - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); + pr_err("Another I/O driver is already registered with KGDB\n"); return -EBUSY; } @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_dbg_io_ops->name); + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index b20d544f20c2..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) bp->bp_free = 1; - kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", - "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", - "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", - "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bc", kdb_bc, "<bpnum>", - "Clear Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("be", kdb_bc, "<bpnum>", - "Enable Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bd", kdb_bc, "<bpnum>", - "Disable Breakpoint", 0, KDB_REPEAT_NONE); - - kdb_register_repeat("ss", kdb_ss, "", - "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + + kdb_register_flags("ss", kdb_ss, "", + "Single Step", 1, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } + /* set CATASTROPHIC if the system contains unresponsive processors */ + for_each_online_cpu(i) + if (!kgdb_info[i].enter_kgdb) + KDB_FLAG_SET(CATASTROPHIC); if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..f191bddf64b8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -12,6 +12,7 @@ */ #include <linux/ctype.h> +#include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kmsg_dump.h> @@ -23,6 +24,7 @@ #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -42,6 +44,12 @@ #include <linux/slab.h> #include "kdb_private.h" +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "kdb." + +static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; +module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); + #define GREP_LEN 256 char kdb_grep_string[GREP_LEN]; int kdb_grepping_flag; @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(BADLENGTH, "Invalid length field"), KDBMSG(NOBP, "No Breakpoint exists"), KDBMSG(BADADDR, "Invalid address"), + KDBMSG(NOPERM, "Permission denied"), }; #undef KDBMSG @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) } /* + * Check whether the flags of the current command and the permissions + * of the kdb console has allow a command to be run. + */ +static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, + bool no_args) +{ + /* permissions comes from userspace so needs massaging slightly */ + permissions &= KDB_ENABLE_MASK; + permissions |= KDB_ENABLE_ALWAYS_SAFE; + + /* some commands change group when launched with no arguments */ + if (no_args) + permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; + + flags |= KDB_ENABLE_ALL; + + return permissions & flags; +} + +/* * kdbgetenv - This function will return the character string value of * an environment variable. * Parameters: @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, kdb_symtab_t symtab; /* + * If the enable flags prohibit both arbitrary memory access + * and flow control then there are no reasonable grounds to + * provide symbol lookup. + */ + if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, + kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Process arguments which follow the following syntax: * * symbol | numeric-address [+/- numeric-offset] @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = 0; if (s->usable) - kdb_register(s->name, kdb_exec_defcmd, - s->usage, s->help, 0); + /* macros are always safe because when executed each + * internal command re-enters kdb_parse() and is + * safety checked individually. + */ + kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, + s->help, 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } if (!s->usable) @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) if (i < kdb_max_commands) { int result; + + if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + return KDB_NOPERM; + KDB_STATE_SET(CMD); result = (*tp->cmd_func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - switch (tp->cmd_repeat) { - case KDB_REPEAT_NONE: - argc = 0; - if (argv[0]) - *(argv[0]) = '\0'; - break; - case KDB_REPEAT_NO_ARGS: - argc = 1; - if (argv[1]) - *(argv[1]) = '\0'; - break; - case KDB_REPEAT_WITH_ARGS: - break; - } + + if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + return result; + + argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + if (argv[argc]) + *(argv[argc]) = '\0'; return result; } @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) */ static int kdb_sr(int argc, const char **argv) { + bool check_mask = + !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); + if (argc != 1) return KDB_ARGCOUNT; + kdb_trap_printk++; - __handle_sysrq(*argv[1], false); + __handle_sysrq(*argv[1], check_mask); kdb_trap_printk--; return 0; @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) { state = 'F'; /* cpu is offline */ + } else if (!kgdb_info[i].enter_kgdb) { + state = 'D'; /* cpu is online but unresponsive */ } else { state = ' '; /* cpu is responding to kdb */ if (kdb_task_state_char(KDB_TSK(i)) == 'I') @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) return 0; if (!kt->cmd_name) continue; + if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + continue; if (strlen(kt->cmd_usage) > 20) space = "\n "; kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) } /* - * kdb_register_repeat - This function is used to register a kernel + * kdb_register_flags - This function is used to register a kernel * debugger command. * Inputs: * cmd Command name @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) * zero for success, one if a duplicate command. */ #define kdb_command_extend 50 /* arbitrary */ -int kdb_register_repeat(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_repeat_t repeat) +int kdb_register_flags(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_cmdflags_t flags) { int i; kdbtab_t *kp; @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, kp->cmd_func = func; kp->cmd_usage = usage; kp->cmd_help = help; - kp->cmd_flags = 0; kp->cmd_minlen = minlen; - kp->cmd_repeat = repeat; + kp->cmd_flags = flags; return 0; } -EXPORT_SYMBOL_GPL(kdb_register_repeat); +EXPORT_SYMBOL_GPL(kdb_register_flags); /* * kdb_register - Compatibility register function for commands that do * not need to specify a repeat state. Equivalent to - * kdb_register_repeat with KDB_REPEAT_NONE. + * kdb_register_flags with flags set to 0. * Inputs: * cmd Command name * func Function to execute the command @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, char *help, short minlen) { - return kdb_register_repeat(cmd, func, usage, help, minlen, - KDB_REPEAT_NONE); + return kdb_register_flags(cmd, func, usage, help, minlen, 0); } EXPORT_SYMBOL_GPL(kdb_register); @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) for_each_kdbcmd(kp, i) kp->cmd_name = NULL; - kdb_register_repeat("md", kdb_md, "<vaddr>", + kdb_register_flags("md", kdb_md, "<vaddr>", "Display Memory Contents, also mdWcN, e.g. md8c1", 1, - KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", - "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", - "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mds", kdb_md, "<vaddr>", - "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", - "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("go", kdb_go, "[<vaddr>]", - "Continue Execution", 1, KDB_REPEAT_NONE); - kdb_register_repeat("rd", kdb_rd, "", - "Display Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", - "Modify Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ef", kdb_ef, "<vaddr>", - "Display exception frame", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", - "Stack traceback", 1, KDB_REPEAT_NONE); - kdb_register_repeat("btp", kdb_bt, "<pid>", - "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", - "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btt", kdb_bt, "<vaddr>", + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, + KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); + kdb_register_flags("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, + KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("rd", kdb_rd, "", + "Display Registers", 0, + KDB_ENABLE_REG_READ); + kdb_register_flags("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, + KDB_ENABLE_REG_WRITE); + kdb_register_flags("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, + KDB_ENABLE_MEM_READ); + kdb_register_flags("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, - KDB_REPEAT_NONE); - kdb_register_repeat("env", kdb_env, "", - "Show environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("set", kdb_set, "", - "Set environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("help", kdb_help, "", - "Display Help Message", 1, KDB_REPEAT_NONE); - kdb_register_repeat("?", kdb_help, "", - "Display Help Message", 0, KDB_REPEAT_NONE); - kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", - "Switch to new cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", - "Display active task list", 0, KDB_REPEAT_NONE); - kdb_register_repeat("pid", kdb_pid, "<pidnum>", - "Switch to another task", 0, KDB_REPEAT_NONE); - kdb_register_repeat("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, KDB_REPEAT_NONE); + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("env", kdb_env, "", + "Show environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("set", kdb_set, "", + "Set environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("help", kdb_help, "", + "Display Help Message", 1, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("?", kdb_help, "", + "Display Help Message", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, + KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, 0); + kdb_register_flags("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, + KDB_ENABLE_REBOOT); #if defined(CONFIG_MODULES) - kdb_register_repeat("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, KDB_REPEAT_NONE); + kdb_register_flags("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, + KDB_ENABLE_INSPECT); #endif #if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_repeat("sr", kdb_sr, "<key>", - "Magic SysRq key", 0, KDB_REPEAT_NONE); + kdb_register_flags("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif #if defined(CONFIG_PRINTK) - kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, KDB_REPEAT_NONE); + kdb_register_flags("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif if (arch_kgdb_ops.enable_nmi) { - kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", - "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); - } - kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", - "Send a signal to a process", 0, KDB_REPEAT_NONE); - kdb_register_repeat("summary", kdb_summary, "", - "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", - "Display per_cpu variables", 3, KDB_REPEAT_NONE); - kdb_register_repeat("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, KDB_REPEAT_NONE); + kdb_register_flags("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, + KDB_ENABLE_ALWAYS_SAFE); + } + kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, + KDB_ENABLE_SIGNAL); + kdb_register_flags("summary", kdb_summary, "", + "Summarize the system", 4, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", + "Display per_cpu variables", 3, + KDB_ENABLE_MEM_READ); + kdb_register_flags("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, + KDB_ENABLE_ALWAYS_SAFE); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -172,10 +172,9 @@ typedef struct _kdbtab { kdb_func_t cmd_func; /* Function to execute command */ char *cmd_usage; /* Usage String for this command */ char *cmd_help; /* Help message for this command */ - short cmd_flags; /* Parsing flags */ short cmd_minlen; /* Minimum legal # command * chars required */ - kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ } kdbtab_t; extern int kdb_bt(int, const char **); /* KDB display back trace */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 113b837470cd..882f835a0d85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4461,18 +4461,14 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs) + struct pt_regs *regs, + struct pt_regs *regs_user_copy) { - if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - regs_user->abi = perf_reg_abi(current); + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; + } else if (current->mm) { + perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -4951,7 +4947,8 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs); + perf_sample_regs_user(&data->regs_user, regs, + &data->regs_user_copy); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ @@ -7477,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { synchronize_rcu(); - perf_install_in_context(ctx, group_leader, event->cpu); + perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, event->cpu); + perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c5..cb346f26a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index 8714e5ded8b4..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -212,27 +212,6 @@ repeat: } /* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -1308,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1331,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1358,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* diff --git a/kernel/fork.c b/kernel/fork.c index 9ca84189cfc2..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); @@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bdf..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -32,10 +32,13 @@ config GCOV_KERNEL Note that the debugfs filesystem has to be mounted to access profiling data. +config ARCH_HAS_GCOV_PROFILE_ALL + def_bool n + config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 + depends on ARCH_HAS_GCOV_PROFILE_ALL default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cfa..664411f171b5 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* init to 2 - one for init_task, one to ensure it is never freed */ @@ -213,6 +214,14 @@ out: return i; } +bool may_setgroups(void) +{ + struct user_namespace *user_ns = current_user_ns(); + + return ns_capable(user_ns, CAP_SETGID) && + userns_may_setgroups(user_ns); +} + /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. @@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4332d766619d..df553b0af936 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } +extern void irq_lock_sparse(void); +extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); +static inline void irq_lock_sparse(void) { } +static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a1782f88f0af..99793b9b6d23 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc) static inline void free_masks(struct irq_desc *desc) { } #endif +void irq_lock_sparse(void) +{ + mutex_lock(&sparse_irq_lock); +} + +void irq_unlock_sparse(void) +{ + mutex_unlock(&sparse_irq_lock); +} + static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) { struct irq_desc *desc; @@ -168,6 +178,12 @@ static void free_desc(unsigned int irq) unregister_irq_proc(irq, desc); + /* + * sparse_irq_lock protects also show_interrupts() and + * kstat_irq_usr(). Once we deleted the descriptor from the + * sparse tree we can free it. Access in proc will fail to + * lookup the descriptor. + */ mutex_lock(&sparse_irq_lock); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq) kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } +/** + * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu + * @irq: The interrupt number + * @cpu: The cpu number + * + * Returns the sum of interrupt counts on @cpu since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. + */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); @@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +/** + * kstat_irqs - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. + */ unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } + +/** + * kstat_irqs_usr - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. Contrary to kstat_irqs() this can be called from any + * preemptible context. It's protected against concurrent removal of + * an interrupt descriptor when sparse irqs are enabled. + */ +unsigned int kstat_irqs_usr(unsigned int irq) +{ + int sum; + + irq_lock_sparse(); + sum = kstat_irqs(irq); + irq_unlock_sparse(); + return sum; +} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ac1ba2f11032..9dc9bfd8a678 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -15,6 +15,23 @@ #include "internals.h" +/* + * Access rules: + * + * procfs protects read/write of /proc/irq/N/ files against a + * concurrent free of the interrupt descriptor. remove_proc_entry() + * immediately prevents new read/writes to happen and waits for + * already running read/write functions to complete. + * + * We remove the proc entries first and then delete the interrupt + * descriptor from the radix tree and free it. So it is guaranteed + * that irq_to_desc(N) is valid as long as the read/writes are + * permitted by procfs. + * + * The read from /proc/interrupts is a different problem because there + * is no protection. So the lookup and the access to irqdesc + * information must be protected by sparse_irq_lock. + */ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP @@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } + irq_lock_sparse(); desc = irq_to_desc(i); if (!desc) - return 0; + goto outsparse; raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) @@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: raw_spin_unlock_irqrestore(&desc->lock, flags); +outsparse: + irq_unlock_sparse(); return 0; } #endif diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!kexec_on_panic) { image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free_control_pages; } } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); } /* * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug * mutexes so that we can do it here after we've verified state. */ + mutex_clear_owner(lock); atomic_set(&lock->count, 1); } diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { diff --git a/kernel/module.c b/kernel/module.c index e52a8739361a..3965511ae133 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,7 +42,6 @@ #include <linux/vermagic.h> #include <linux/notifier.h> #include <linux/sched.h> -#include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> @@ -98,7 +97,7 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ + * (delete and add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; -int register_module_notifier(struct notifier_block * nb) +int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); -int unregister_module_notifier(struct notifier_block * nb) +int unregister_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&module_notify_list, nb); } @@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); +/* MODULE_REF_BASE is the base reference count by kmodule loader. */ +#define MODULE_REF_BASE 1 + /* Init the unload section of the module. */ static int module_unload_init(struct module *mod) { - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) - return -ENOMEM; + /* + * Initialize reference counter to MODULE_REF_BASE. + * refcnt == 0 means module is going. + */ + atomic_set(&mod->refcnt, MODULE_REF_BASE); INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - raw_cpu_write(mod->refptr->incs, 1); + atomic_inc(&mod->refcnt); return 0; } @@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); - - free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags) } #endif /* CONFIG_MODULE_FORCE_UNLOAD */ -struct stopref +/* Try to release refcount of module, 0 means success. */ +static int try_release_module_ref(struct module *mod) { - struct module *mod; - int flags; - int *forced; -}; + int ret; -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; + /* Try to decrement refcnt which we set at loading */ + ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); + BUG_ON(ret < 0); + if (ret) + /* Someone can put this right now, recover with checking */ + ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); + + return ret; +} +static int try_stop_module(struct module *mod, int flags, int *forced) +{ /* If it's not unused, quit unless we're forcing. */ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) + if (try_release_module_ref(mod) != 0) { + *forced = try_force_unload(flags); + if (!(*forced)) return -EWOULDBLOCK; } /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - struct stopref sref = { mod, flags, forced }; + mod->state = MODULE_STATE_GOING; - return stop_machine(__try_stop_module, &sref, NULL); + return 0; } unsigned long module_refcount(struct module *mod) { - unsigned long incs = 0, decs = 0; - int cpu; - - for_each_possible_cpu(cpu) - decs += per_cpu_ptr(mod->refptr, cpu)->decs; - /* - * ensure the incs are added up after the decs. - * module_put ensures incs are visible before decs with smp_wmb. - * - * This 2-count scheme avoids the situation where the refcount - * for CPU0 is read, then CPU0 increments the module refcount, - * then CPU1 drops that refcount, then the refcount for CPU1 is - * read. We would record a decrement but not its corresponding - * increment so we would see a low count (disaster). - * - * Rare situation? But module_refcount can be preempted, and we - * might be tallying up 4096+ CPUs. So it is not impossible. - */ - smp_rmb(); - for_each_possible_cpu(cpu) - incs += per_cpu_ptr(mod->refptr, cpu)->incs; - return incs - decs; + return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) seq_printf(m, " %lu ", module_refcount(mod)); - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. */ + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; seq_printf(m, "%s,", use->source->name); @@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) if (mod->init != NULL && mod->exit == NULL) { printed_something = 1; - seq_printf(m, "[permanent],"); + seq_puts(m, "[permanent],"); } if (!printed_something) - seq_printf(m, "-"); + seq_puts(m, "-"); } void __symbol_put(const char *symbol) @@ -935,7 +918,7 @@ void __module_get(struct module *module) { if (module) { preempt_disable(); - __this_cpu_inc(module->refptr->incs); + atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); preempt_enable(); } @@ -948,11 +931,11 @@ bool try_module_get(struct module *module) if (module) { preempt_disable(); - - if (likely(module_is_live(module))) { - __this_cpu_inc(module->refptr->incs); + /* Note: here, we can fail to get a reference */ + if (likely(module_is_live(module) && + atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); - } else + else ret = false; preempt_enable(); @@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get); void module_put(struct module *module) { + int ret; + if (module) { preempt_disable(); - smp_wmb(); /* see comment in module_refcount */ - __this_cpu_inc(module->refptr->decs); - + ret = atomic_dec_if_positive(&module->refcnt); + WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); preempt_enable(); } @@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put); static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. */ - seq_printf(m, " - -"); + seq_puts(m, " - -"); } static inline void module_unload_free(struct module *mod) @@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc, static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs, return 0; bad_version: - printk("%s: disagrees about version of symbol %s\n", + pr_warn("%s: disagrees about version of symbol %s\n", mod->name, symname); return 0; } @@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; } -struct module_sect_attr -{ +struct module_sect_attr { struct module_attribute mattr; char *name; unsigned long address; }; -struct module_sect_attrs -{ +struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; struct module_sect_attr attrs[0]; @@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod) (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); ++temp_attr; } } @@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod) /* pick a field to test for end of list */ if (!attr->attr.name) break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); if (attr->free) attr->free(mod); } @@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - module_bug_cleanup(mod); - return 0; -} - #ifdef CONFIG_DEBUG_SET_MODULE_RONX /* * LKM RO/NX protection: protect module's text/ro-data @@ -1860,7 +1831,12 @@ static void free_module(struct module *mod) /* Now we can delete it from the lists */ mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + /* Remove this module from bug list, this uses list_del_rcu */ + module_bug_cleanup(mod); + /* Wait for RCU synchronizing before releasing mod->list and buglist. */ + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ @@ -1955,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) /* We compiled with -fno-common. These are not supposed to happen. */ pr_debug("Common symbol: %s\n", name); - printk("%s: please compile with -fno-common\n", + pr_warn("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; break; @@ -2259,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum) { const Elf_Shdr *sec; @@ -2735,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) * This shouldn't happen with same compiler and binutils * building all parts of the module. */ - printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + pr_warn("%s: has both .ctors and .init_array.\n", mod->name); return -EINVAL; } @@ -3023,8 +2999,10 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ + /* + * Init routine failed: abort. Try to protect us from + * buggy refcounters. + */ mod->state = MODULE_STATE_GOING; synchronize_sched(); module_put(mod); @@ -3202,7 +3180,7 @@ out: static int unknown_module_param_cb(char *param, char *val, const char *modname) { - /* Check for magic 'dyndbg' arg */ + /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); @@ -3352,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); wake_up_all(&module_wq); + /* Wait for RCU synchronizing before releasing mod->list. */ + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); @@ -3685,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p) /* Informative for users. */ seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%pK", mod->module_core); @@ -3695,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p) if (mod->taints) seq_printf(m, " %s", module_flags(mod, buf)); - seq_printf(m, "\n"); + seq_puts(m, "\n"); return 0; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ef42d0ab3115..49746c81ad8d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_ns *ei; struct file *file; + struct ns_common *ns; int err; file = proc_ns_fget(fd); @@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = get_proc_ns(file_inode(file)); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) + ns = get_proc_ns(file_inode(file)); + if (nstype && (ns->ops->type != nstype)) goto out; new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); @@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } - err = ops->install(new_nsproxy, ei->ns); + err = ns->ops->install(new_nsproxy, ns); if (err) { free_nsproxy(new_nsproxy); goto out; diff --git a/kernel/params.c b/kernel/params.c index db97b791390f..0af9b2c4e56c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -603,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { - struct module_param_attrs *new; - struct attribute **attrs; - int err, num; + struct module_param_attrs *new_mp; + struct attribute **new_attrs; + unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { - num = 0; - attrs = NULL; - } else { - num = mk->mp->num; - attrs = mk->mp->grp.attrs; + /* First allocation. */ + mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); + if (!mk->mp) + return -ENOMEM; + mk->mp->grp.name = "parameters"; + /* NULL-terminated attribute array. */ + mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), + GFP_KERNEL); + /* Caller will cleanup via free_module_param_attrs */ + if (!mk->mp->grp.attrs) + return -ENOMEM; } - /* Enlarge. */ - new = krealloc(mk->mp, - sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), - GFP_KERNEL); - if (!new) { - kfree(attrs); - err = -ENOMEM; - goto fail; - } - /* Despite looking like the typical realloc() bug, this is safe. - * We *want* the old 'attrs' to be freed either way, and we'll store - * the new one in the success case. */ - attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); - if (!attrs) { - err = -ENOMEM; - goto fail_free_new; - } + /* Enlarge allocations. */ + new_mp = krealloc(mk->mp, + sizeof(*mk->mp) + + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + GFP_KERNEL); + if (!new_mp) + return -ENOMEM; + mk->mp = new_mp; - /* Sysfs wants everything zeroed. */ - memset(new, 0, sizeof(*new)); - memset(&new->attrs[num], 0, sizeof(new->attrs[num])); - memset(&attrs[num], 0, sizeof(attrs[num])); - new->grp.name = "parameters"; - new->grp.attrs = attrs; + /* Extra pointer for NULL terminator */ + new_attrs = krealloc(mk->mp->grp.attrs, + sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), + GFP_KERNEL); + if (!new_attrs) + return -ENOMEM; + mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - sysfs_attr_init(&new->attrs[num].mattr.attr); - new->attrs[num].param = kp; - new->attrs[num].mattr.show = param_attr_show; - new->attrs[num].mattr.store = param_attr_store; - new->attrs[num].mattr.attr.name = (char *)name; - new->attrs[num].mattr.attr.mode = kp->perm; - new->num = num+1; + sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); + mk->mp->attrs[mk->mp->num].param = kp; + mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + /* Do not allow runtime DAC changes to make param writable. */ + if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; + mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ - for (num = 0; num < new->num; num++) - new->grp.attrs[num] = &new->attrs[num].mattr.attr; - new->grp.attrs[num] = NULL; - - mk->mp = new; + for (i = 0; i < mk->mp->num; i++) + mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; + mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; - -fail_free_new: - kfree(new); -fail: - mk->mp = NULL; - return err; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { - kfree(mk->mp->grp.attrs); + if (mk->mp) + kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } @@ -695,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod, if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); - if (err) + if (err) { + free_module_param_attrs(&mod->mkobj); return err; + } params = true; } diff --git a/kernel/pid.c b/kernel/pid.c index 82430c858d69..cd36a5e0d173 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .proc_inum = PROC_PID_INIT_INO, + .ns.inum = PROC_PID_INIT_INO, +#ifdef CONFIG_PID_NS + .ns.ops = &pidns_operations, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index bc6d6a89b6e6..a65ba137fd15 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) goto out_free_map; + ns->ns.ops = &pidns_operations; kref_init(&ns->kref); ns->level = level; @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -333,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static void *pidns_get(struct task_struct *task) +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + +static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -343,18 +349,18 @@ static void *pidns_get(struct task_struct *task) get_pid_ns(ns); rcu_read_unlock(); - return ns; + return ns ? &ns->ns : NULL; } -static void pidns_put(void *ns) +static void pidns_put(struct ns_common *ns) { - put_pid_ns(ns); + put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, void *ns) +static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = ns; + struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -382,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int pidns_inum(void *ns) -{ - struct pid_namespace *pid_ns = ns; - return pid_ns->proc_inum; -} - const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, - .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6e7708c2c21f..48b28d387c7f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,7 +94,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS - select PM_RUNTIME + select PM config PM_SLEEP_SMP def_bool y @@ -130,23 +130,19 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y -config PM_RUNTIME - bool "Run-time PM core functionality" +config PM + bool "Device power management core functionality" ---help--- Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated + (low power) states, for example after a specified period of inactivity + (autosuspended), and woken up in response to a hardware-generated wake-up event or a driver's request. Hardware support is generally required for this functionality to work and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and + responsible for the actual handling of device suspend requests and wake-up events. -config PM - def_bool y - depends on PM_SLEEP || PM_RUNTIME - config PM_DEBUG bool "Power Management Debug Support" depends on PM diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f900dc9f6822..02d6b6d28796 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1857,10 +1857,16 @@ asmlinkage __visible int printk(const char *fmt, ...) int r; va_start(args, fmt); - preempt_disable(); + + /* + * If a caller overrides the per_cpu printk_func, then it needs + * to disable preemption when calling printk(). Otherwise + * the printk_func should be set to the default. No need to + * disable preemption here. + */ vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - preempt_enable(); + va_end(args); return r; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5797b78add6..c0accc00566e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7113,9 +7113,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -7135,13 +7132,13 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } +#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e5db8c6feebd..b52092f2636d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -570,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df2cdf77f899..40667cbf371b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4005,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } @@ -4424,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * tg->shares) / W; + wl = (w * (long)tg->shares) / W; else wl = tg->shares; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + int i; + unsigned long ip; + int generated; + int total = 0; + + if (WARN_ON(!trace->entries)) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + ip = trace->entries[i]; + generated = snprintf(buf, size, "%*c[<%p>] %pS\n", + 1 + spaces, ' ', (void *) ip, (void *) ip); + + total += generated; + + /* Assume that generated isn't a negative number */ + if (generated >= size) { + buf += size; + size = 0; + } else { + buf += generated; + size -= generated; + } + } + + return total; +} +EXPORT_SYMBOL_GPL(snprint_stack_trace); + /* * Architectures that do not implement save_stack_trace_tsk or * save_stack_trace_regs get this weak alias and a once-per-bootup warning diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 61eea02b53f5..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -226,3 +226,6 @@ cond_syscall(sys_seccomp); /* access BPF programs and maps */ cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7c54ff79afd7..137c7f69b264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_KEXEC { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..c09c07817d7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..4892352f0e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -34,82 +34,6 @@ #include "tick-internal.h" #include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4d54b7540585..1363d58f07e9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -847,7 +847,6 @@ void tick_nohz_idle_enter(void) local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); /** * tick_nohz_irq_exit - update next tick event from interrupt exit @@ -974,7 +973,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 65015ff2f07c..6390517e77d4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -741,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n) return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } +EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..4687b3104bae --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,112 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..979ccde26720 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o -ifeq ($(CONFIG_PM_RUNTIME),y) +ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o endif ifeq ($(CONFIG_TRACING),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11b9cb36092b..483cecfa5c17 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab76b7bcb36a..2e767972e99c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; +/* Pipe tracepoints to printk */ +struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -193,6 +197,13 @@ static int __init set_trace_boot_clock(char *str) } __setup("trace_clock=", set_trace_boot_clock); +static int __init set_tracepoint_printk(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); unsigned long long ns2usecs(cycle_t nsec) { @@ -6898,6 +6909,19 @@ out: return ret; } +void __init trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (WARN_ON(!tracepoint_print_iter)) + tracepoint_printk = 0; + } + tracer_alloc_buffers(); + init_ftrace_syscalls(); + trace_event_init(); +} + __init static int clear_boot_tracer(void) { /* @@ -6917,6 +6941,5 @@ __init static int clear_boot_tracer(void) return 0; } -early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3255dfb054a0..8de48bac1ce2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1301,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, #define perf_ftrace_event_register NULL #endif +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +#else +static inline void init_ftrace_syscalls(void) { } +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +#else +static inline void __init trace_event_init(void) { } +#endif + +extern struct trace_iterator *tracepoint_print_iter; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d0e4f92b5eb6..366a78a3e61e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); + +static void output_printk(struct ftrace_event_buffer *fbuffer) +{ + struct ftrace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + if (!iter) + return; + + event_call = fbuffer->ftrace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->ftrace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) { + if (tracepoint_printk) + output_printk(fbuffer); + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); @@ -2480,8 +2512,14 @@ static __init int event_trace_init(void) #endif return 0; } -early_initcall(event_trace_memsetup); -core_initcall(event_trace_enable); + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); +} + fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index b0b1c44e923a..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -132,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_REPEAT_NONE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index dfe00a4f3f3e..c6ee36fcbf90 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -514,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -static int __init init_ftrace_syscalls(void) +void __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; @@ -524,7 +524,7 @@ static int __init init_ftrace_syscalls(void) GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return -ENOMEM; + return; } for (i = 0; i < NR_syscalls; i++) { @@ -536,10 +536,7 @@ static int __init init_ftrace_syscalls(void) meta->syscall_nr = i; syscalls_metadata[i] = meta; } - - return 0; } -early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/uid16.c b/kernel/uid16.c index 602e5bbbceff..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e44..b069ccbfb0b0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .proc_inum = PROC_USER_INIT_INO, + .ns.inum = PROC_USER_INIT_INO, +#ifdef CONFIG_USER_NS + .ns.ops = &userns_operations, +#endif + .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3ec..4109f8320684 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -24,6 +24,7 @@ #include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - ret = proc_alloc_inum(&ns->proc_inum); + ret = ns_alloc_inum(&ns->ns); if (ret) { kmem_cache_free(user_ns_cachep, ns); return ret; } + ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ + mutex_lock(&userns_state_mutex); + ns->flags = parent_ns->flags; + mutex_unlock(&userns_state_mutex); + set_cred_user_ns(new, ns); #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); ns = parent; } while (atomic_dec_and_test(&parent->count)); @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } - -static DEFINE_MUTEX(id_map_mutex); - static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; /* - * The id_map_mutex serializes all writes to any given map. + * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); ret = -EPERM; /* Only allow one successful write to the map */ @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!page) goto out; - /* Only allow <= page size writes at the beginning of the file */ + /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, *ppos = count; ret = count; out: - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); if (page) free_page(page); return ret; @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + const struct cred *cred = file->f_cred; + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. + */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && + uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->fsuid)) + if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, file->f_cred->fsgid)) + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && + gid_eq(gid, cred->egid)) return true; } } @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, return false; } -static void *userns_get(struct task_struct *task) +int proc_setgroups_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + unsigned long userns_flags = ACCESS_ONCE(ns->flags); + + seq_printf(seq, "%s\n", + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? + "allow" : "deny"); + return 0; +} + +ssize_t proc_setgroups_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + char kbuf[8], *pos; + bool setgroups_allowed; + ssize_t ret; + + /* Only allow a very narrow range of strings to be written */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= sizeof(kbuf))) + goto out; + + /* What was written? */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + ret = -EINVAL; + if (strncmp(pos, "allow", 5) == 0) { + pos += 5; + setgroups_allowed = true; + } + else if (strncmp(pos, "deny", 4) == 0) { + pos += 4; + setgroups_allowed = false; + } + else + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + ret = -EPERM; + mutex_lock(&userns_state_mutex); + if (setgroups_allowed) { + /* Enabling setgroups after setgroups has been disabled + * is not allowed. + */ + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) + goto out_unlock; + } else { + /* Permanently disabling setgroups after setgroups has + * been enabled by writing the gid_map is not allowed. + */ + if (ns->gid_map.nr_extents != 0) + goto out_unlock; + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; + } + mutex_unlock(&userns_state_mutex); + + /* Report a successful write */ + *ppos = count; + ret = count; +out: + return ret; +out_unlock: + mutex_unlock(&userns_state_mutex); + goto out; +} + +bool userns_may_setgroups(const struct user_namespace *ns) +{ + bool allowed; + + mutex_lock(&userns_state_mutex); + /* It is not safe to use setgroups until a gid mapping in + * the user namespace has been established. + */ + allowed = ns->gid_map.nr_extents != 0; + /* Is setgroups allowed? */ + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); + mutex_unlock(&userns_state_mutex); + + return allowed; +} + +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + +static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); - return user_ns; + return user_ns ? &user_ns->ns : NULL; } -static void userns_put(void *ns) +static void userns_put(struct ns_common *ns) { - put_user_ns(ns); + put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, void *ns) +static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - struct user_namespace *user_ns = ns; + struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } -static unsigned int userns_inum(void *ns) -{ - struct user_namespace *user_ns = ns; - return user_ns->proc_inum; -} - const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, - .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 883aaaa7de8a..831ea7108232 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) { kfree(ns); return ERR_PTR(err); } + ns->ns.ops = &utsns_operations; + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kfree(ns); } -static void *utsns_get(struct task_struct *task) +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; @@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task) } task_unlock(task); - return ns; + return ns ? &ns->ns : NULL; } -static void utsns_put(void *ns) +static void utsns_put(struct ns_common *ns) { - put_uts_ns(ns); + put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, void *new) +static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) { - struct uts_namespace *ns = new; + struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } -static unsigned int utsns_inum(void *vp) -{ - struct uts_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, - .inum = utsns_inum, }; |