Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c                  |  94
-rw-r--r--  kernel/audit.h                 |  17
-rw-r--r--  kernel/auditsc.c               | 176
-rw-r--r--  kernel/debug/debug_core.c      |  19
-rw-r--r--  kernel/debug/kdb/kdb_io.c      |  46
-rw-r--r--  kernel/debug/kdb/kdb_main.c    |  16
-rw-r--r--  kernel/debug/kdb/kdb_private.h |   4
-rw-r--r--  kernel/gcov/Makefile           |  36
-rw-r--r--  kernel/kexec.c                 |  23
-rw-r--r--  kernel/locking/rtmutex.c       |   3
-rw-r--r--  kernel/module.c                |   9
-rw-r--r--  kernel/power/suspend.c         |  43
-rw-r--r--  kernel/printk/printk.c         |   2
-rw-r--r--  kernel/ptrace.c                |   1
-rw-r--r--  kernel/rcu/tree_plugin.h       |   1
-rw-r--r--  kernel/sched/auto_group.c      |   6
-rw-r--r--  kernel/sched/completion.c      |  19
-rw-r--r--  kernel/sched/core.c            | 113
-rw-r--r--  kernel/sched/deadline.c        |  33
-rw-r--r--  kernel/sched/idle.c            |  16
-rw-r--r--  kernel/sched/sched.h           |  76
-rw-r--r--  kernel/seccomp.c               |   4
-rw-r--r--  kernel/signal.c                |   4
-rw-r--r--  kernel/time/ntp.c              |  10
-rw-r--r--  kernel/time/tick-common.c      |  50
-rw-r--r--  kernel/time/timekeeping.c      |  48
-rw-r--r--  kernel/time/timekeeping.h      |   2
27 files changed, 476 insertions(+), 395 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	struct fs_pin		pin;
+	atomic_long_t		count;
+	struct rcu_head		rcu;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
+static void do_acct_process(struct bsd_acct_struct *acct);
+
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -124,32 +127,56 @@ out:
 	return acct->active;
 }
 
+static void acct_put(struct bsd_acct_struct *p)
+{
+	if (atomic_long_dec_and_test(&p->count))
+		kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+	return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = ACCESS_ONCE(ns->bacct);
+	res = to_acct(ACCESS_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
 	}
-	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+	if (!atomic_long_inc_not_zero(&res->count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		goto again;
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (!res->ns) {
+	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
-		pin_put(&res->pin);
+		acct_put(res);
 		goto again;
 	}
 	return res;
 }
 
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct = to_acct(pin);
+	mutex_lock(&acct->lock);
+	do_acct_process(acct);
+	schedule_work(&acct->work);
+	wait_for_completion(&acct->done);
+	cmpxchg(&acct->ns->bacct, pin, NULL);
+	mutex_unlock(&acct->lock);
+	pin_remove(pin);
+	acct_put(acct);
+}
+
 static void close_work(struct work_struct *work)
 {
 	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
 	complete(&acct->done);
 }
 
-static void acct_kill(struct bsd_acct_struct *acct,
-		      struct bsd_acct_struct *new)
-{
-	if (acct) {
-		struct pid_namespace *ns = acct->ns;
-		do_acct_process(acct);
-		INIT_WORK(&acct->work, close_work);
-		init_completion(&acct->done);
-		schedule_work(&acct->work);
-		wait_for_completion(&acct->done);
-		pin_remove(&acct->pin);
-		ns->bacct = new;
-		acct->ns = NULL;
-		atomic_long_dec(&acct->pin.count);
-		mutex_unlock(&acct->lock);
-		pin_put(&acct->pin);
-	}
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
-	struct bsd_acct_struct *acct;
-	acct = container_of(pin, struct bsd_acct_struct, pin);
-	mutex_lock(&acct->lock);
-	if (!acct->ns) {
-		mutex_unlock(&acct->lock);
-		pin_put(pin);
-		acct = NULL;
-	}
-	acct_kill(acct, NULL);
-}
-
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt, *internal;
 	struct pid_namespace *ns = task_active_pid_ns(current);
-	struct bsd_acct_struct *acct, *old;
+	struct bsd_acct_struct *acct;
+	struct fs_pin *old;
 	int err;
 
 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
 	mnt = file->f_path.mnt;
 	file->f_path.mnt = internal;
 
-	atomic_long_set(&acct->pin.count, 1);
-	acct->pin.kill = acct_pin_kill;
+	atomic_long_set(&acct->count, 1);
+	init_fs_pin(&acct->pin, acct_pin_kill);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
+	INIT_WORK(&acct->work, close_work);
+	init_completion(&acct->done);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	pin_insert(&acct->pin, mnt);
 
-	old = acct_get(ns);
-	if (old)
-		acct_kill(old, acct);
-	else
-		ns->bacct = acct;
+	rcu_read_lock();
+	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
+	pin_kill(old);
 	mnt_drop_write(mnt);
 	mntput(mnt);
 	return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+		rcu_read_lock();
+		pin_kill(task_active_pid_ns(current)->bacct);
 	}
 
 	return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	acct_kill(acct_get(ns), NULL);
+	rcu_read_lock();
+	pin_kill(ns->bacct);
}
 
 /*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-			pin_put(&acct->pin);
+			acct_put(acct);
 		}
 	}
 }
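The acct.c hunks above replace the count embedded in the fs_pin with a dedicated refcount that acct_get() takes under RCU via atomic_long_inc_not_zero(), then revalidates after locking. A minimal userspace C11 sketch of that acquire-side pattern; the names and types here are illustrative, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_long count;	/* plays the role of bsd_acct_struct.count */
};

/* Take a reference only while the object is still live (count > 0),
 * mirroring atomic_long_inc_not_zero(): a CAS loop that refuses to
 * resurrect an object whose count already dropped to zero. */
static bool obj_get_not_zero(struct obj *p)
{
	long c = atomic_load(&p->count);

	while (c > 0) {
		/* on failure, c is reloaded with the current count */
		if (atomic_compare_exchange_weak(&p->count, &c, c + 1))
			return true;
	}
	return false;	/* dying object: the caller re-reads the pointer and retries */
}

A failed acquire is why acct_get() loops back to "again": the pointer it read under RCU may belong to an object whose last reference was dropped between the load and the increment.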
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
 	};
 };
 
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device.
 */
@@ -86,7 +79,6 @@ struct audit_names {
 	struct filename		*name;
 	int			name_len;	/* number of chars to log */
 	bool			hidden;		/* don't log this record */
-	bool			name_put;	/* call __putname()? */
 
 	unsigned long		ino;
 	dev_t			dev;
@@ -208,11 +200,6 @@ struct audit_context {
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
-	int		    put_count;
-	int		    ino_count;
-#endif
 };
 
 extern u32 audit_ever_enabled;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566dd0caf..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
 {
 	struct audit_names *n, *next;
 
-#if AUDIT_DEBUG == 2
-	if (context->put_count + context->ino_count != context->name_count) {
-		int i = 0;
-
-		pr_err("%s:%d(:%d): major=%d in_syscall=%d"
-		       " name_count=%d put_count=%d ino_count=%d"
-		       " [NOT freeing]\n", __FILE__, __LINE__,
-		       context->serial, context->major, context->in_syscall,
-		       context->name_count, context->put_count,
-		       context->ino_count);
-		list_for_each_entry(n, &context->names_list, list) {
-			pr_err("names[%d] = %p = %s\n", i++, n->name,
-			       n->name->name ?: "(null)");
-		}
-		dump_stack();
-		return;
-	}
-#endif
-#if AUDIT_DEBUG
-	context->put_count  = 0;
-	context->ino_count  = 0;
-#endif
-
 	list_for_each_entry_safe(n, next, &context->names_list, list) {
 		list_del(&n->list);
-		if (n->name && n->name_put)
-			final_putname(n->name);
+		if (n->name)
+			putname(n->name);
 		if (n->should_free)
 			kfree(n);
 	}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 	list_add_tail(&aname->list, &context->names_list);
 
 	context->name_count++;
-#if AUDIT_DEBUG
-	context->ino_count++;
-#endif
 	return aname;
 }
 
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
 	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
-		if (n->name->uptr == uptr)
+		if (n->name->uptr == uptr) {
+			n->name->refcnt++;
 			return n->name;
+		}
 	}
 	return NULL;
 }
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
 	struct audit_context *context = current->audit_context;
 	struct audit_names *n;
 
-	if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): ignoring getname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		dump_stack();
-#endif
+	if (!context->in_syscall)
 		return;
-	}
-
-#if AUDIT_DEBUG
-	/* The filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
 
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
-	n->name_put = true;
 	name->aname = n;
+	name->refcnt++;
 
 	if (!context->pwd.dentry)
 		get_fs_pwd(current->fs, &context->pwd);
 }
 
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
-	struct audit_context *context = current->audit_context;
-
-	BUG_ON(!context);
-	if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): final_putname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		if (context->name_count) {
-			struct audit_names *n;
-			int i = 0;
-
-			list_for_each_entry(n, &context->names_list, list)
-				pr_err("name[%d] = %p = %s\n", i++, n->name,
-				       n->name->name ?: "(null)");
-		}
-#endif
-		final_putname(name);
-	}
-#if AUDIT_DEBUG
-	else {
-		++context->put_count;
-		if (context->put_count > context->name_count) {
-			pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
-			       " name_count=%d put_count=%d\n",
-			       __FILE__, __LINE__,
-			       context->serial, context->major,
-			       context->in_syscall, name->name,
-			       context->name_count, context->put_count);
-			dump_stack();
-		}
-	}
-#endif
-}
-
 /**
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	if (!name)
 		goto out_alloc;
 
-#if AUDIT_DEBUG
-	/* The struct filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 	/*
 	 * If we have a pointer to an audit_names entry already, then we can
 	 * just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	}
 
 	list_for_each_entry_reverse(n, &context->names_list, list) {
-		if (!n->name || strcmp(n->name->name, name->name))
+		if (n->ino) {
+			/* valid inode number, use that for the comparison */
+			if (n->ino != inode->i_ino ||
+			    n->dev != inode->i_sb->s_dev)
+				continue;
+		} else if (n->name) {
+			/* inode number has not been set, check the name */
+			if (strcmp(n->name->name, name->name))
+				continue;
+		} else
+			/* no inode and no name (?!) ... this is odd ... */
 			continue;
 
 		/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
 		return;
-	/* unfortunately, while we may have a path name to record with the
-	 * inode, we can't always rely on the string lasting until the end of
-	 * the syscall so we need to create our own copy, it may fail due to
-	 * memory allocation issues, but we do our best */
 	if (name) {
-		/* we can't use getname_kernel() due to size limits */
-		size_t len = strlen(name->name) + 1;
-		struct filename *new = __getname();
-
-		if (unlikely(!new))
-			goto out;
-
-		if (len <= (PATH_MAX - sizeof(*new))) {
-			new->name = (char *)(new) + sizeof(*new);
-			new->separate = false;
-		} else if (len <= PATH_MAX) {
-			/* this looks odd, but is due to final_putname() */
-			struct filename *new2;
-
-			new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
-			if (unlikely(!new2)) {
-				__putname(new);
-				goto out;
-			}
-			new2->name = (char *)new;
-			new2->separate = true;
-			new = new2;
-		} else {
-			/* we should never get here, but let's be safe */
-			__putname(new);
-			goto out;
-		}
-		strlcpy((char *)new->name, name->name, len);
-		new->uptr = NULL;
-		new->aname = n;
-		n->name = new;
-		n->name_put = true;
+		n->name = name;
+		name->refcnt++;
 	}
+
 out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
 
 	/* look for a parent entry first */
 	list_for_each_entry(n, &context->names_list, list) {
-		if (!n->name || n->type != AUDIT_TYPE_PARENT)
+		if (!n->name ||
+		    (n->type != AUDIT_TYPE_PARENT &&
+		     n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
-		if (n->ino == parent->i_ino &&
-		    !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+		if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+		    !audit_compare_dname_path(dname,
					      n->name->name, n->name_len)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = AUDIT_TYPE_PARENT;
 			found_parent = n;
 			break;
 		}
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
 	/* is there a matching child entry? */
 	list_for_each_entry(n, &context->names_list, list) {
 		/* can only match entries that have a name */
-		if (!n->name || n->type != type)
-			continue;
-
-		/* if we found a parent, make sure this one is a child of it */
-		if (found_parent && (n->name != found_parent->name))
+		if (!n->name ||
+		    (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
 		if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
 				found_parent ?
 				found_parent->name_len :
 				AUDIT_NAME_FULL)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = type;
 			found_child = n;
 			break;
 		}
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			/* don't call __putname() */
-			found_child->name_put = false;
+			found_child->name->refcnt++;
 		}
 	}
+
 	if (inode)
 		audit_copy_inode(found_child, dentry, inode);
 	else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
 	struct cpu_vfs_cap_data vcaps;
-	struct dentry *dentry;
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 
-	dentry = dget(bprm->file->f_path.dentry);
-	get_vfs_caps_from_disk(dentry, &vcaps);
-	dput(dentry);
+	get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 
 	ax->fcap.permitted = vcaps.permitted;
 	ax->fcap.inheritable = vcaps.inheritable;
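The auditsc.c rework above stops copying path strings into the audit context and instead bumps the new refcnt field in struct filename, so audit and the VFS share one allocation. A toy sketch of that share-instead-of-copy scheme, with a hypothetical name_ref type standing in for struct filename (the kernel's refcnt needs no atomics because it is only touched by the owning task):

#include <stdlib.h>

/* Hypothetical stand-in for struct filename with the new refcnt field. */
struct name_ref {
	char *name;
	int refcnt;	/* serialized per task in the kernel */
};

static struct name_ref *name_get(struct name_ref *n)
{
	n->refcnt++;	/* audit now shares the string instead of copying it */
	return n;
}

static void name_put(struct name_ref *n)
{
	if (--n->refcnt == 0) {	/* last user frees, like putname() */
		free(n->name);
		free(n);
	}
}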
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 07ce18ca71e0..0874e2edd275 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -604,7 +604,7 @@ return_normal:
 			   online_cpus)
 			cpu_relax();
 		if (!time_left)
-			pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
+			pr_crit("Timed out waiting for secondary CPUs.\n");
 
 	/*
 	 * At this point the primary processor is completely
@@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 	if (arch_kgdb_ops.enable_nmi)
 		arch_kgdb_ops.enable_nmi(0);
 
+	/*
+	 * Avoid entering the debugger if we were triggered due to an oops
+	 * but panic_timeout indicates the system should automatically
+	 * reboot on panic. We don't want to get stuck waiting for input
+	 * on such systems, especially if it's "just" an oops.
+	 */
+	if (signo != SIGTRAP && panic_timeout)
+		return 1;
+
 	memset(ks, 0, sizeof(struct kgdb_state));
 	ks->cpu			= raw_smp_processor_id();
@@ -828,6 +836,15 @@ static int
 kgdb_panic_event(struct notifier_block *self,
		 unsigned long val,
		 void *data)
 {
+	/*
+	 * Avoid entering the debugger if we were triggered due to a panic.
+	 * We don't want to get stuck waiting for input from the user in
+	 * such a case. panic_timeout indicates the system should
+	 * automatically reboot on panic.
+	 */
+	if (panic_timeout)
+		return NOTIFY_DONE;
+
 	if (dbg_kdb_mode)
 		kdb_printf("PANIC: %s\n", (char *)data);
 	kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..fc1ef736253c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -439,7 +439,7 @@ poll_again:
  * substituted for %d, %x or %o in the prompt.
  */
 
-char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
 {
 	if (prompt && kdb_prompt_str != prompt)
 		strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
@@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor)
 	return 0;
 }
 
-int vkdb_printf(const char *fmt, va_list ap)
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
 {
 	int diag;
 	int linecount;
@@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap)
 			size_avail = sizeof(kdb_buffer) - len;
 			goto kdb_print_out;
 		}
+		if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+			/*
+			 * This was an interactive search (using '/' at more
+			 * prompt) and it has completed. Clear the flag.
+			 */
+			kdb_grepping_flag = 0;
 		/*
 		 * at this point the string is a full line and
 		 * should be printed, up to the null.
@@ -691,19 +697,20 @@ kdb_printit:
 	 * Write to all consoles.
 	 */
 	retlen = strlen(kdb_buffer);
+	cp = (char *) printk_skip_level(kdb_buffer);
 	if (!dbg_kdb_mode && kgdb_connected) {
-		gdbstub_msg_write(kdb_buffer, retlen);
+		gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
 	} else {
 		if (dbg_io_ops && !dbg_io_ops->is_console) {
-			len = retlen;
-			cp = kdb_buffer;
+			len = retlen - (cp - kdb_buffer);
+			cp2 = cp;
 			while (len--) {
-				dbg_io_ops->write_char(*cp);
-				cp++;
+				dbg_io_ops->write_char(*cp2);
+				cp2++;
 			}
 		}
 		while (c) {
-			c->write(c, kdb_buffer, retlen);
+			c->write(c, cp, retlen - (cp - kdb_buffer));
 			touch_nmi_watchdog();
 			c = c->next;
 		}
@@ -711,7 +718,10 @@ kdb_printit:
 	if (logging) {
 		saved_loglevel = console_loglevel;
 		console_loglevel = CONSOLE_LOGLEVEL_SILENT;
-		printk(KERN_INFO "%s", kdb_buffer);
+		if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+			printk("%s", kdb_buffer);
+		else
+			pr_info("%s", kdb_buffer);
 	}
 
 	if (KDB_STATE(PAGER)) {
@@ -794,11 +804,23 @@ kdb_printit:
 			kdb_nextline = linecount - 1;
 			kdb_printf("\r");
 			suspend_grep = 1; /* for this recursion */
+		} else if (buf1[0] == '/' && !kdb_grepping_flag) {
+			kdb_printf("\r");
+			kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+				   kdbgetenv("SEARCHPROMPT") ?: "search> ");
+			*strchrnul(kdb_grep_string, '\n') = '\0';
+			kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+			suspend_grep = 1; /* for this recursion */
 		} else if (buf1[0] && buf1[0] != '\n') {
 			/* user hit something other than enter */
 			suspend_grep = 1; /* for this recursion */
-			kdb_printf("\nOnly 'q' or 'Q' are processed at more "
-				   "prompt, input ignored\n");
+			if (buf1[0] != '/')
+				kdb_printf(
+				    "\nOnly 'q', 'Q' or '/' are processed at "
+				    "more prompt, input ignored\n");
+			else
+				kdb_printf("\n'/' cannot be used during | "
+					   "grep filtering, input ignored\n");
 		} else if (kdb_grepping_flag) {
 			/* user hit enter */
 			suspend_grep = 1; /* for this recursion */
@@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...)
 	int r;
 
 	va_start(ap, fmt);
-	r = vkdb_printf(fmt, ap);
+	r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
 	va_end(ap);
 
 	return r;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 7b40c5f07dce..4121345498e0 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -50,8 +50,7 @@
 static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
 module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
 
-#define GREP_LEN 256
-char kdb_grep_string[GREP_LEN];
+char kdb_grep_string[KDB_GREP_STRLEN];
 int kdb_grepping_flag;
 EXPORT_SYMBOL(kdb_grepping_flag);
 int kdb_grep_leading;
@@ -870,7 +869,7 @@ static void parse_grep(const char *str)
 	len = strlen(cp);
 	if (!len)
 		return;
-	if (len >= GREP_LEN) {
+	if (len >= KDB_GREP_STRLEN) {
 		kdb_printf("search string too long\n");
 		return;
 	}
@@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr)
 	char *cp;
 	char *cpp, quoted;
 	kdbtab_t *tp;
-	int i, escaped, ignore_errors = 0, check_grep;
+	int i, escaped, ignore_errors = 0, check_grep = 0;
 
 	/*
 	 * First tokenize the command string.
 	 */
 	cp = (char *)cmdstr;
-	kdb_grepping_flag = check_grep = 0;
 
 	if (KDB_FLAG(CMD_INTERRUPT)) {
 		/* Previous command was interrupted, newline must not
@@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		kdb_printf("due to NonMaskable Interrupt @ "
 			   kdb_machreg_fmt "\n",
 			   instruction_pointer(regs));
-		kdb_dumpregs(regs);
 		break;
 	case KDB_REASON_SSTEP:
 	case KDB_REASON_BREAK:
@@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		 */
 		kdb_nextline = 1;
 		KDB_STATE_CLEAR(SUPPRESS);
+		kdb_grepping_flag = 0;
+		/* ensure the old search does not leak into '/' commands */
+		kdb_grep_string[0] = '\0';
 
 		cmdbuf = cmd_cur;
 		*cmdbuf = '\0';
@@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
 	/*
	 * Validate cpunum
 	 */
-	if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
+	if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
 		return KDB_BADCPUNUM;
 
 	dbg_switch_cpu = cpunum;
@@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	kdb_printf("\nMemTotal:       %8lu kB\nMemFree:        %8lu kB\n"
 		   "Buffers:        %8lu kB\n",
-		   val.totalram, val.freeram, val.bufferram);
+		   K(val.totalram), K(val.freeram), K(val.bufferram));
 	return 0;
 }
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index eaacd1693954..75014d7f4568 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
 
 /* Miscellaneous functions and data areas */
 extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
 extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
@@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p);
 extern void kdb_print_nameval(const char *name, unsigned long val);
 extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
 extern void kdb_meminfo_proc_show(void);
-extern char *kdb_getstr(char *, size_t, char *);
+extern char *kdb_getstr(char *, size_t, const char *);
 extern void kdb_gdb_state_pass(char *buf);
 
 /* Defines for kdb_symbol_print */
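The vkdb_printf() changes above strip the printk level prefix (KERN_SOH plus one level character) before writing to raw consoles, so debugger output no longer shows "<6>"-style headers. A rough userspace analogue of printk_skip_level(), assuming the same two-byte '\001<level>' header; unlike printk_get_level(), this sketch does not validate the level character:

#include <stdio.h>

#define SOH '\001'	/* KERN_SOH: marks a "<level>" prefix in printk strings */

/* Step past the two-byte header so raw console output does not show it. */
static const char *skip_level(const char *s)
{
	if (s[0] == SOH && s[1] != '\0')
		return s + 2;
	return s;
}

int main(void)
{
	const char msg[] = "\0016" "hello\n";	/* KERN_INFO-style prefix */
	fputs(skip_level(msg), stdout);		/* prints just "hello" */
	return 0;
}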
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
 ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
-  cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
-  cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
-  cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
-  obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
-  obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
						gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c85277639b34..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 }
 
 /*
- * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * Free up memory used by kernel, initrd, and command line. This is temporary
  * memory allocation which is not needed any more after these buffers have
  * been loaded into separate segments and have been copied elsewhere.
 */
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
 	destination &= PAGE_MASK;
 
 	result = kimage_add_entry(image, destination | IND_DESTINATION);
-	if (result == 0)
-		image->destination = destination;
 
 	return result;
 }
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
 	page &= PAGE_MASK;
 
 	result = kimage_add_entry(image, page | IND_SOURCE);
-	if (result == 0)
-		image->destination += PAGE_SIZE;
 
 	return result;
 }
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (nr_segments > 0) {
 		unsigned long i;
 
-		/* Loading another kernel to reboot into */
-		if ((flags & KEXEC_ON_CRASH) == 0)
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-		/* Loading another kernel to switch to if this one crashes */
-		else if (flags & KEXEC_ON_CRASH) {
-			/* Free any current crash dump kernel before
+		if (flags & KEXEC_ON_CRASH) {
+			/*
+			 * Loading another kernel to switch to if this one
+			 * crashes.  Free any current crash dump kernel before
 			 * we corrupt it.
 			 */
+
 			kimage_free(xchg(&kexec_crash_image, NULL));
 			result = kimage_alloc_init(&image, entry, nr_segments,
						   segments, flags);
 			crash_map_reserved_pages();
+		} else {
+			/* Loading another kernel to reboot into. */
+
+			result = kimage_alloc_init(&image, entry, nr_segments,
+						   segments, flags);
 		}
 		if (result)
 			goto out;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3059bc2f022d..e16e5542bf13 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1193,7 +1193,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
 
 	if (unlikely(ret)) {
-		remove_waiter(lock, &waiter);
+		if (rt_mutex_has_waiters(lock))
+			remove_waiter(lock, &waiter);
 		rt_mutex_handle_deadlock(ret, chwalk, &waiter);
 	}
diff --git a/kernel/module.c b/kernel/module.c
index 8426ad48362c..b34813f725e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3025,8 +3025,13 @@ static void do_free_init(struct rcu_head *head)
 	kfree(m);
 }
 
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
 {
 	int ret = 0;
 	struct mod_initfree *freeinit;
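The reworked sys_kexec_load() path above frees the previous crash kernel with kimage_free(xchg(&kexec_crash_image, NULL)): detach the pointer atomically, then release the object outside any race window. A small C11 sketch of that detach-then-free shape (illustrative names, not the kexec API):

#include <stdatomic.h>
#include <stdlib.h>

static _Atomic(void *) crash_image;

/* Swap in the replacement, then free whatever was there before.
 * Nobody can observe a half-freed object: the slot never points
 * at the old image once the exchange has happened. */
static void replace_image(void *new_image)
{
	void *old = atomic_exchange(&crash_image, new_image);

	free(old);	/* free(NULL) is a no-op, so an empty slot is fine */
}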
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
-	cpuidle_use_deepest_state(true);
+	spin_lock_irq(&suspend_freeze_lock);
+	if (pm_wakeup_pending())
+		goto out;
+
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	spin_unlock_irq(&suspend_freeze_lock);
+
+	get_online_cpus();
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+	/* Push all the CPUs into the idle loop. */
+	wake_up_all_idle_cpus();
+	pr_debug("PM: suspend-to-idle\n");
+	/* Make the current CPU wait so it can enter the idle loop too. */
+	wait_event(suspend_freeze_wait_head,
+		   suspend_freeze_state == FREEZE_STATE_WAKE);
+	pr_debug("PM: resume from suspend-to-idle\n");
+
 	cpuidle_pause();
-	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+
+	spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+	suspend_freeze_state = FREEZE_STATE_NONE;
+	spin_unlock_irq(&suspend_freeze_lock);
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	unsigned long flags;
+
+	spin_lock_irqsave(&suspend_freeze_lock, flags);
+	if (suspend_freeze_state > FREEZE_STATE_NONE) {
+		suspend_freeze_state = FREEZE_STATE_WAKE;
+		wake_up(&suspend_freeze_wait_head);
+	}
+	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c06df7de0963..01cfd69c54c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args)
 
 #ifdef CONFIG_KGDB_KDB
 	if (unlikely(kdb_trap_printk)) {
-		r = vkdb_printf(fmt, args);
+		r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
 		return r;
 	}
 #endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 }
 
 #if defined CONFIG_COMPAT
-#include <linux/compat.h>
 
 int compat_ptrace_request(struct task_struct *child, compat_long_t request,
			  compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0d7bbe3095ad..0a571e9a0f1d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -326,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 	special = t->rcu_read_unlock_special;
 	if (special.b.need_qs) {
 		rcu_preempt_qs();
+		t->rcu_read_unlock_special.b.need_qs = false;
 		if (!t->rcu_read_unlock_special.s) {
 			local_irq_restore(flags);
 			return;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
 	 * so we don't have to move tasks around upon policy change,
 	 * or flail around trying to allocate bandwidth on the fly.
 	 * A bandwidth exception in __sched_setscheduler() allows
-	 * the policy change to proceed.  Thereafter, task_group()
-	 * returns &root_task_group, so zero bandwidth is required.
+	 * the policy change to proceed.
 	 */
 	free_rt_sched_group(tg);
 	tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	if (tg != &root_task_group)
 		return false;
 
-	if (p->sched_class != &fair_sched_class)
-		return false;
-
 	/*
 	 * We can only assume the task group can't go away on us if
 	 * autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
 	 * first without taking the lock so we can
 	 * return early in the blocking case.
 	 */
-	if (!ACCESS_ONCE(x->done))
+	if (!READ_ONCE(x->done))
 		return 0;
 
 	spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
 */
 bool completion_done(struct completion *x)
 {
-	return !!ACCESS_ONCE(x->done);
+	if (!READ_ONCE(x->done))
+		return false;
+
+	/*
+	 * If ->done, we need to wait for complete() to release ->wait.lock
+	 * otherwise we can end up freeing the completion before complete()
+	 * is done referencing it.
+	 *
+	 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+	 * the loads of ->done and ->wait.lock such that we cannot observe
+	 * the lock before complete() acquires it while observing the ->done
+	 * after it's acquired the lock.
+	 */
+	smp_rmb();
+	spin_unlock_wait(&x->wait.lock);
+	return true;
 }
 EXPORT_SYMBOL(completion_done);
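completion_done() above may no longer report true while complete() is still inside ->wait.lock, because callers typically free the completion as soon as it returns true. A pthread-based analogue of the same lifetime rule; the kernel version avoids taking the full lock by pairing smp_rmb() with spin_unlock_wait(), but the guarantee it buys is the one shown here:

#include <pthread.h>
#include <stdbool.h>

struct work {
	pthread_mutex_t lock;	/* assume PTHREAD_MUTEX_INITIALIZER */
	bool done;
};

/* Signaler: the very last touch of *w is the mutex unlock. */
static void complete(struct work *w)
{
	pthread_mutex_lock(&w->lock);
	w->done = true;
	pthread_mutex_unlock(&w->lock);
}

/* Safe "is it done?" check before freeing: taking and dropping the
 * lock guarantees complete() is no longer inside it, so a true return
 * means freeing *w cannot race with the signaler. */
static bool work_done(struct work *w)
{
	bool done;

	pthread_mutex_lock(&w->lock);
	done = w->done;
	pthread_mutex_unlock(&w->lock);
	return done;
}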
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13049aac05a6..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
 int sysctl_sched_rt_runtime = 950000;
 
 /*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	lockdep_assert_held(&p->pi_lock);
-
-	for (;;) {
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-			return rq;
-		raw_spin_unlock(&rq->lock);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-	__acquires(p->pi_lock)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	for (;;) {
-		raw_spin_lock_irqsave(&p->pi_lock, *flags);
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-			return rq;
-		raw_spin_unlock(&rq->lock);
-		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
-
-static void __task_rq_unlock(struct rq *rq)
-	__releases(rq->lock)
-{
-	raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-	__releases(rq->lock)
-	__releases(p->pi_lock)
-{
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
  * this_rq_lock - lock this runqueue and disable interrupts.
 */
 static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
-void __sched io_schedule(void)
-{
-	struct rq *rq = raw_rq();
-
-	delayacct_blkio_start();
-	atomic_inc(&rq->nr_iowait);
-	blk_flush_plug(current);
-	current->in_iowait = 1;
-	schedule();
-	current->in_iowait = 0;
-	atomic_dec(&rq->nr_iowait);
-	delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-	struct rq *rq = raw_rq();
+	int old_iowait = current->in_iowait;
+	struct rq *rq;
 	long ret;
 
+	current->in_iowait = 1;
+	if (old_iowait)
+		blk_schedule_flush_plug(current);
+	else
+		blk_flush_plug(current);
+
 	delayacct_blkio_start();
+	rq = raw_rq();
 	atomic_inc(&rq->nr_iowait);
-	blk_flush_plug(current);
-	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
-	current->in_iowait = 0;
+	current->in_iowait = old_iowait;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
+
 	return ret;
 }
+EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
@@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *g, *p;
 
+	/*
+	 * Autogroups do not have RT tasks; see autogroup_create().
+	 */
+	if (task_group_is_autogroup(tg))
+		return 0;
+
 	for_each_process_thread(g, p) {
 		if (rt_task(p) && task_group(p) == tg)
 			return 1;
@@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
 	int i, err = 0;
 
+	/*
+	 * Disallowing the root group RT runtime is BAD, it would disallow the
+	 * kernel creating (and or operating) RT threads.
+	 */
+	if (tg == &root_task_group && rt_runtime == 0)
+		return -EINVAL;
+
+	/* No period doesn't make any sense. */
+	if (rt_period == 0)
+		return -EINVAL;
+
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-	if (rt_period == 0)
-		return -EINVAL;
-
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
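io_schedule_timeout() above now saves and restores current->in_iowait instead of setting and clearing it, so a nested caller no longer wipes out the outer caller's iowait state. The same save/set/restore shape in a standalone sketch; do_blocking_wait() is a trivial stand-in for schedule_timeout():

/* Re-entrancy-safe flag handling: save, set, restore - never blindly clear. */
static _Thread_local int in_iowait;

static long do_blocking_wait(long timeout)
{
	return timeout;		/* stand-in for schedule_timeout() */
}

static long io_wait(long timeout)
{
	int old = in_iowait;	/* may already be set if we are nested */
	long ret;

	in_iowait = 1;
	ret = do_blocking_wait(timeout);
	in_iowait = old;	/* an outer io_wait() keeps its flag */
	return ret;
}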
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
						     struct sched_dl_entity,
						     dl_timer);
 	struct task_struct *p = dl_task_of(dl_se);
+	unsigned long flags;
 	struct rq *rq;
-again:
-	rq = task_rq(p);
-	raw_spin_lock(&rq->lock);
-
-	if (rq != task_rq(p)) {
-		/* Task was moved, retrying. */
-		raw_spin_unlock(&rq->lock);
-		goto again;
-	}
+
+	rq = task_rq_lock(current, &flags);
 
 	/*
	 * We need to take care of several possible races here:
@@ -541,6 +535,26 @@ again:
 
 	sched_clock_tick();
 	update_rq_clock(rq);
+
+	/*
+	 * If the throttle happened during sched-out; like:
+	 *
+	 *   schedule()
+	 *     deactivate_task()
+	 *       dequeue_task_dl()
+	 *         update_curr_dl()
+	 *           start_dl_timer()
+	 *         __dequeue_task_dl()
+	 *     prev->on_rq = 0;
+	 *
+	 * We can be both throttled and !queued. Replenish the counter
+	 * but do not enqueue -- wait for our wakeup to do that.
+	 */
+	if (!task_on_rq_queued(p)) {
+		replenish_dl_entity(dl_se, dl_se);
+		goto unlock;
+	}
+
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
 		check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ again:
 		push_dl_task(rq);
 #endif
 unlock:
-	raw_spin_unlock(&rq->lock);
+	task_rq_unlock(rq, current, &flags);
 
 	return HRTIMER_NORESTART;
 }
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
 		rq->curr->dl.dl_yielded = 1;
 		p->dl.runtime = 0;
 	}
+	update_rq_clock(rq);
 	update_curr_dl(rq);
 }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index aaf1c1d5cf5d..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -105,6 +106,21 @@ static void cpuidle_idle_call(void)
 	rcu_idle_enter();
 
 	/*
+	 * Suspend-to-idle ("freeze") is a system state in which all user space
+	 * has been frozen, all I/O devices have been suspended and the only
+	 * activity happens here and in interrupts (if any). In that case bypass
+	 * the cpuidle governor and go straight for the deepest idle state
+	 * available. Possibly also suspend the local tick and the entire
+	 * timekeeping to prevent timer interrupts from kicking us out of idle
+	 * until a proper wakeup interrupt happens.
+	 */
+	if (idle_should_freeze()) {
+		cpuidle_enter_freeze();
+		local_irq_enable();
+		goto exit_idle;
+	}
+
+	/*
	 * Ask the cpuidle framework to choose a convenient idle state.
	 * Fall back to the default arch idle method on errors.
	 */
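The deadline.c timer above switches to task_rq_lock(), whose core trick — spelled out in the sched.h hunk that follows — is to lock a snapshot of the task's runqueue and then revalidate it, since the task may have migrated before the lock was acquired. A simplified pthread sketch of that lock-and-revalidate loop; it deliberately ignores the memory-ordering details the kernel comment documents:

#include <pthread.h>

struct queue { pthread_mutex_t lock; };
struct task  { struct queue *q; };	/* q can change while unlocked */

/* Lock whatever queue the task currently belongs to. */
static struct queue *task_queue_lock(struct task *t)
{
	for (;;) {
		struct queue *q = t->q;		/* snapshot */

		pthread_mutex_lock(&q->lock);
		if (q == t->q)			/* still the right queue? */
			return q;
		pthread_mutex_unlock(&q->lock);	/* migrated: retry */
	}
}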
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	for (;;) {
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+			return rq;
+		raw_spin_unlock(&rq->lock);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	for (;;) {
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		/*
+		 *	move_queued_task()		task_rq_lock()
+		 *
+		 *	ACQUIRE (rq->lock)
+		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
+		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
+		 *	[S] ->cpu = new_cpu		[L] task_rq()
+		 *					[L] ->on_rq
+		 *	RELEASE (rq->lock)
+		 *
+		 * If we observe the old cpu in task_rq_lock, the acquire of
+		 * the old rq->lock will fully serialize against the stores.
+		 *
+		 * If we observe the new cpu in task_rq_lock, the acquire will
+		 * pair with the WMB to ensure we must then also see migrating.
+		 */
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+			return rq;
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+	__releases(rq->lock)
+{
+	raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+	__releases(rq->lock)
+	__releases(p->pi_lock)
+{
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_PREEMPT
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
 
 	switch (action) {
 	case SECCOMP_RET_ERRNO:
-		/* Set the low-order 16-bits as a errno. */
+		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
+		if (data > MAX_ERRNO)
+			data = MAX_ERRNO;
 		syscall_set_return_value(current, task_pt_regs(current),
					 -data, 0);
 		goto skip;
diff --git a/kernel/signal.c b/kernel/signal.c
index 33a52759cc0e..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
 SYSCALL_DEFINE0(pause)
 {
 	while (!signal_pending(current)) {
-		current->state = TASK_INTERRUPTIBLE;
+		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 	}
 	return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
 	current->saved_sigmask = current->blocked;
 	set_current_blocked(set);
 
-	current->state = TASK_INTERRUPTIBLE;
+	__set_current_state(TASK_INTERRUPTIBLE);
 	schedule();
 	set_restore_sigmask();
 	return -ERESTARTNOHAND;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4b585e0fdd22..0f60b08a4f07 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -633,10 +633,14 @@ int ntp_validate_timex(struct timex *txc)
 	if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
 		return -EPERM;
 
-	if (txc->modes & ADJ_FREQUENCY) {
-		if (LONG_MIN / PPM_SCALE > txc->freq)
+	/*
+	 * Check for potential multiplication overflows that can
+	 * only happen on 64-bit systems:
+	 */
+	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+		if (LLONG_MIN / PPM_SCALE > txc->freq)
 			return -EINVAL;
-		if (LONG_MAX / PPM_SCALE < txc->freq)
+		if (LLONG_MAX / PPM_SCALE < txc->freq)
 			return -EINVAL;
 	}
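The ntp.c hunk above bounds txc->freq by dividing the limit rather than multiplying the value, which can never itself overflow. The same guard in isolation; PPM_SCALE's value here is only a placeholder for the real constant:

#include <limits.h>
#include <stdbool.h>

#define PPM_SCALE 65536LL	/* hypothetical positive scale factor */

/* Reject any freq whose freq * PPM_SCALE would overflow a long long:
 * comparing against the pre-divided limits is overflow-free. */
static bool freq_in_range(long long freq)
{
	if (LLONG_MIN / PPM_SCALE > freq)
		return false;
	if (LLONG_MAX / PPM_SCALE < freq)
		return false;
	return true;
}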
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping.  Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	tick_freeze_depth++;
+	if (tick_freeze_depth == num_online_cpus()) {
+		timekeeping_suspend();
+	} else {
+		tick_suspend();
+		tick_suspend_broadcast();
+	}
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping.  Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	if (tick_freeze_depth == num_online_cpus())
+		timekeeping_resume();
+	else
+		tick_resume();
+
+	tick_freeze_depth--;
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
 /**
  * tick_init - initialize the tick control
 */
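tick_freeze()/tick_unfreeze() above count callers under a raw spinlock so that only the last CPU to freeze suspends timekeeping, and the first to unfreeze resumes it. A compact sketch of that last-one-does-the-global-step pattern, with nr_cpus standing in for num_online_cpus() and empty bodies standing in for the tick and timekeeping calls:

#include <pthread.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_depth;
static const unsigned int nr_cpus = 4;	/* stand-in for num_online_cpus() */

static void local_suspend(void)  { }	/* per-CPU step, like tick_suspend() */
static void global_suspend(void) { }	/* system step, like timekeeping_suspend() */

/* Every CPU calls this once; only the final caller takes the global step. */
static void freeze_one(void)
{
	pthread_mutex_lock(&freeze_lock);
	if (++freeze_depth == nr_cpus)
		global_suspend();
	else
		local_suspend();
	pthread_mutex_unlock(&freeze_lock);
}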
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b124af259800..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
- * @tk:		The timekeeper from which we take the update
- * @tkf:	The fast timekeeper to update
- * @tbase:	The time base for the fast timekeeper (mono/raw)
+ * @tkr: Timekeeping readout base from which we take the update
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 * smp_wmb();	<- Ensure that the last base[1] update is visible
 * tkf->seq++;
 * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[0], tk);
+ * update(tkf->base[0], tkr);
 * smp_wmb();	<- Ensure that the base[0] update is visible
 * tkf->seq++;
 * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[1], tk);
+ * update(tkf->base[1], tkr);
 *
 * The reader side does:
 *
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
-static void update_fast_timekeeper(struct timekeeper *tk)
+static void update_fast_timekeeper(struct tk_read_base *tkr)
 {
 	struct tk_read_base *base = tk_fast_mono.base;
 
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
 	raw_write_seqcount_latch(&tk_fast_mono.seq);
 
 	/* Update base[0] */
-	memcpy(base, &tk->tkr, sizeof(*base));
+	memcpy(base, tkr, sizeof(*base));
 
 	/* Force readers back to base[0] */
 	raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
 
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+	return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended.  It will return the same
+ * number of cycles every time until timekeeping is resumed at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+	static struct tk_read_base tkr_dummy;
+	struct tk_read_base *tkr = &tk->tkr;
+
+	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+	cycles_at_suspend = tkr->read(tkr->clock);
+	tkr_dummy.read = dummy_clock_read;
+	update_fast_timekeeper(&tkr_dummy);
+}
+
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
 
 static inline void update_vsyscall(struct timekeeper *tk)
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
		       sizeof(tk_core.timekeeper));
 
-	update_fast_timekeeper(tk);
+	update_fast_timekeeper(&tk->tkr);
 }
 
 /**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
 * xtime/wall_to_monotonic/jiffies/etc are
 * still managed by arch specific suspend/resume code.
 */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
 	}
 
 	timekeeping_update(tk, TK_MIRROR);
+	halt_fast_timekeeper(tk);
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
 extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
 
 #endif
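The fast-timekeeper changes above build on the seqcount latch documented in update_fast_timekeeper(): the writer bumps a sequence counter around each slot update so lock-free readers are always steered to the slot that is not being written. A minimal C11 latch that compresses the kernel's explicit barriers into sequentially consistent atomics; a single long long stands in for the full tk_read_base snapshot:

#include <stdatomic.h>

struct latch {
	_Atomic unsigned int seq;
	_Atomic long long base[2];
};

/* Writer: the first seq++ steers readers to base[1] while base[0] is
 * rewritten; the second steers them to the fresh base[0] while base[1]
 * catches up. Readers therefore never see a slot mid-update. */
static void latch_write(struct latch *l, long long v)
{
	atomic_fetch_add(&l->seq, 1);
	atomic_store(&l->base[0], v);
	atomic_fetch_add(&l->seq, 1);
	atomic_store(&l->base[1], v);
}

/* Reader: pick the slot seq points at, retry if seq moved meanwhile. */
static long long latch_read(struct latch *l)
{
	unsigned int seq;
	long long v;

	do {
		seq = atomic_load(&l->seq);
		v = atomic_load(&l->base[seq & 1]);
	} while (seq != atomic_load(&l->seq));
	return v;
}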