Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 94
-rw-r--r--  kernel/audit.h | 17
-rw-r--r--  kernel/auditsc.c | 176
-rw-r--r--  kernel/debug/debug_core.c | 19
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 46
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 16
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/gcov/Makefile | 36
-rw-r--r--  kernel/kexec.c | 23
-rw-r--r--  kernel/locking/rtmutex.c | 3
-rw-r--r--  kernel/module.c | 9
-rw-r--r--  kernel/power/suspend.c | 43
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/rcu/tree_plugin.h | 1
-rw-r--r--  kernel/sched/auto_group.c | 6
-rw-r--r--  kernel/sched/completion.c | 19
-rw-r--r--  kernel/sched/core.c | 113
-rw-r--r--  kernel/sched/deadline.c | 33
-rw-r--r--  kernel/sched/idle.c | 16
-rw-r--r--  kernel/sched/sched.h | 76
-rw-r--r--  kernel/seccomp.c | 4
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/time/ntp.c | 10
-rw-r--r--  kernel/time/tick-common.c | 50
-rw-r--r--  kernel/time/timekeeping.c | 48
-rw-r--r--  kernel/time/timekeeping.h | 2
27 files changed, 476 insertions, 395 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct);
struct bsd_acct_struct {
struct fs_pin pin;
+ atomic_long_t count;
+ struct rcu_head rcu;
struct mutex lock;
int active;
unsigned long needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
struct completion done;
};
+static void do_acct_process(struct bsd_acct_struct *acct);
+
/*
* Check the amount of free space and suspend/resume accordingly.
*/
@@ -124,32 +127,56 @@ out:
return acct->active;
}
+static void acct_put(struct bsd_acct_struct *p)
+{
+ if (atomic_long_dec_and_test(&p->count))
+ kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+ return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
rcu_read_lock();
- res = ACCESS_ONCE(ns->bacct);
+ res = to_acct(ACCESS_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
}
- if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ if (!atomic_long_inc_not_zero(&res->count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (!res->ns) {
+ if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
- pin_put(&res->pin);
+ acct_put(res);
goto again;
}
return res;
}
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct = to_acct(pin);
+ mutex_lock(&acct->lock);
+ do_acct_process(acct);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ cmpxchg(&acct->ns->bacct, pin, NULL);
+ mutex_unlock(&acct->lock);
+ pin_remove(pin);
+ acct_put(acct);
+}
+
static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
complete(&acct->done);
}
-static void acct_kill(struct bsd_acct_struct *acct,
- struct bsd_acct_struct *new)
-{
- if (acct) {
- struct pid_namespace *ns = acct->ns;
- do_acct_process(acct);
- INIT_WORK(&acct->work, close_work);
- init_completion(&acct->done);
- schedule_work(&acct->work);
- wait_for_completion(&acct->done);
- pin_remove(&acct->pin);
- ns->bacct = new;
- acct->ns = NULL;
- atomic_long_dec(&acct->pin.count);
- mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
- }
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
- struct bsd_acct_struct *acct;
- acct = container_of(pin, struct bsd_acct_struct, pin);
- mutex_lock(&acct->lock);
- if (!acct->ns) {
- mutex_unlock(&acct->lock);
- pin_put(pin);
- acct = NULL;
- }
- acct_kill(acct, NULL);
-}
-
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
- struct bsd_acct_struct *acct, *old;
+ struct bsd_acct_struct *acct;
+ struct fs_pin *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
mnt = file->f_path.mnt;
file->f_path.mnt = internal;
- atomic_long_set(&acct->pin.count, 1);
- acct->pin.kill = acct_pin_kill;
+ atomic_long_set(&acct->count, 1);
+ init_fs_pin(&acct->pin, acct_pin_kill);
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
- old = acct_get(ns);
- if (old)
- acct_kill(old, acct);
- else
- ns->bacct = acct;
+ rcu_read_lock();
+ old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
+ pin_kill(old);
mnt_drop_write(mnt);
mntput(mnt);
return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+ rcu_read_lock();
+ pin_kill(task_active_pid_ns(current)->bacct);
}
return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
void acct_exit_ns(struct pid_namespace *ns)
{
- acct_kill(acct_get(ns), NULL);
+ rcu_read_lock();
+ pin_kill(ns->bacct);
}
/*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
+ acct_put(acct);
}
}
}
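
For reference, the lookup pattern acct_get() now retries on boils down to "take a reference only if the count is still non-zero". A minimal userspace sketch with C11 atomics (illustrative only; names and types are not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_not_zero(atomic_long *count)
{
	long old = atomic_load(count);

	while (old != 0) {
		/* on failure, 'old' is reloaded with the current value */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;	/* reference taken */
	}
	return false;			/* object is dying; caller retries the lookup */
}

int main(void)
{
	atomic_long refs = 1;

	printf("%d\n", inc_not_zero(&refs));	/* 1: got a reference */
	atomic_store(&refs, 0);			/* last reference dropped */
	printf("%d\n", inc_not_zero(&refs));	/* 0: too late, retry the lookup */
	return 0;
}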
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
-/* 0 = no checking
- 1 = put_count checking
- 2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
};
};
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
@@ -86,7 +79,6 @@ struct audit_names {
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
- bool name_put; /* call __putname()? */
unsigned long ino;
dev_t dev;
@@ -208,11 +200,6 @@ struct audit_context {
};
int fds[2];
struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
};
extern u32 audit_ever_enabled;
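
The refcnt-based model the new comment describes can be sketched in userspace as follows (illustrative only; field and function names are made up, not the kernel's): every audit_names entry that keeps a name takes a reference, and only the final put actually frees the buffer.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filename_sketch {
	char *name;
	int refcnt;
};

static struct filename_sketch *getname_sketch(const char *s)
{
	struct filename_sketch *fn = calloc(1, sizeof(*fn));

	if (!fn)
		return NULL;
	fn->name = strdup(s);
	fn->refcnt = 1;
	return fn;
}

static void putname_sketch(struct filename_sketch *fn)
{
	if (--fn->refcnt > 0)
		return;			/* audit still holds a reference */
	free(fn->name);
	free(fn);
}

int main(void)
{
	struct filename_sketch *fn = getname_sketch("/etc/passwd");

	if (!fn)
		return 1;
	fn->refcnt++;			/* __audit_getname() takes its own reference */
	putname_sketch(fn);		/* caller's putname(): name survives */
	putname_sketch(fn);		/* audit_free_names(): last put frees it */
	return 0;
}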
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566dd0caf..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
-#if AUDIT_DEBUG == 2
- if (context->put_count + context->ino_count != context->name_count) {
- int i = 0;
-
- pr_err("%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d ino_count=%d"
- " [NOT freeing]\n", __FILE__, __LINE__,
- context->serial, context->major, context->in_syscall,
- context->name_count, context->put_count,
- context->ino_count);
- list_for_each_entry(n, &context->names_list, list) {
- pr_err("names[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
- dump_stack();
- return;
- }
-#endif
-#if AUDIT_DEBUG
- context->put_count = 0;
- context->ino_count = 0;
-#endif
-
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
- if (n->name && n->name_put)
- final_putname(n->name);
+ if (n->name)
+ putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
return aname;
}
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr)
+ if (n->name->uptr == uptr) {
+ n->name->refcnt++;
return n->name;
+ }
}
return NULL;
}
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
struct audit_context *context = current->audit_context;
struct audit_names *n;
- if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): ignoring getname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- dump_stack();
-#endif
+ if (!context->in_syscall)
return;
- }
-
-#if AUDIT_DEBUG
- /* The filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
- n->name_put = true;
name->aname = n;
+ name->refcnt++;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
- struct audit_context *context = current->audit_context;
-
- BUG_ON(!context);
- if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): final_putname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- if (context->name_count) {
- struct audit_names *n;
- int i = 0;
-
- list_for_each_entry(n, &context->names_list, list)
- pr_err("name[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
-#endif
- final_putname(name);
- }
-#if AUDIT_DEBUG
- else {
- ++context->put_count;
- if (context->put_count > context->name_count) {
- pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
- " name_count=%d put_count=%d\n",
- __FILE__, __LINE__,
- context->serial, context->major,
- context->in_syscall, name->name,
- context->name_count, context->put_count);
- dump_stack();
- }
- }
-#endif
-}
-
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
if (!name)
goto out_alloc;
-#if AUDIT_DEBUG
- /* The struct filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
/*
* If we have a pointer to an audit_names entry already, then we can
* just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (!n->name || strcmp(n->name->name, name->name))
+ if (n->ino) {
+ /* valid inode number, use that for the comparison */
+ if (n->ino != inode->i_ino ||
+ n->dev != inode->i_sb->s_dev)
+ continue;
+ } else if (n->name) {
+ /* inode number has not been set, check the name */
+ if (strcmp(n->name->name, name->name))
+ continue;
+ } else
+ /* no inode and no name (?!) ... this is odd ... */
continue;
/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
- /* unfortunately, while we may have a path name to record with the
- * inode, we can't always rely on the string lasting until the end of
- * the syscall so we need to create our own copy, it may fail due to
- * memory allocation issues, but we do our best */
if (name) {
- /* we can't use getname_kernel() due to size limits */
- size_t len = strlen(name->name) + 1;
- struct filename *new = __getname();
-
- if (unlikely(!new))
- goto out;
-
- if (len <= (PATH_MAX - sizeof(*new))) {
- new->name = (char *)(new) + sizeof(*new);
- new->separate = false;
- } else if (len <= PATH_MAX) {
- /* this looks odd, but is due to final_putname() */
- struct filename *new2;
-
- new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
- if (unlikely(!new2)) {
- __putname(new);
- goto out;
- }
- new2->name = (char *)new;
- new2->separate = true;
- new = new2;
- } else {
- /* we should never get here, but let's be safe */
- __putname(new);
- goto out;
- }
- strlcpy((char *)new->name, name->name, len);
- new->uptr = NULL;
- new->aname = n;
- n->name = new;
- n->name_put = true;
+ n->name = name;
+ name->refcnt++;
}
+
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
/* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name || n->type != AUDIT_TYPE_PARENT)
+ if (!n->name ||
+ (n->type != AUDIT_TYPE_PARENT &&
+ n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname,
+ n->name->name, n->name_len)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
break;
}
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
- if (!n->name || n->type != type)
- continue;
-
- /* if we found a parent, make sure this one is a child of it */
- if (found_parent && (n->name != found_parent->name))
+ if (!n->name ||
+ (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
found_parent ?
found_parent->name_len :
AUDIT_NAME_FULL)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = type;
found_child = n;
break;
}
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- /* don't call __putname() */
- found_child->name_put = false;
+ found_child->name->refcnt++;
}
}
+
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
struct audit_aux_data_bprm_fcaps *ax;
struct audit_context *context = current->audit_context;
struct cpu_vfs_cap_data vcaps;
- struct dentry *dentry;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_path.dentry);
- get_vfs_caps_from_disk(dentry, &vcaps);
- dput(dentry);
+ get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 07ce18ca71e0..0874e2edd275 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -604,7 +604,7 @@ return_normal:
online_cpus)
cpu_relax();
if (!time_left)
- pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
+ pr_crit("Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
@@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
+ /*
+ * Avoid entering the debugger if we were triggered due to an oops
+ * but panic_timeout indicates the system should automatically
+ * reboot on panic. We don't want to get stuck waiting for input
+ * on such systems, especially if it's "just" an oops.
+ */
+ if (signo != SIGTRAP && panic_timeout)
+ return 1;
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
@@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self,
unsigned long val,
void *data)
{
+ /*
+ * Avoid entering the debugger if we were triggered due to a panic.
+ * We don't want to get stuck waiting for input from the user in that
+ * case; panic_timeout indicates the system should automatically
+ * reboot on panic.
+ */
+ if (panic_timeout)
+ return NOTIFY_DONE;
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", (char *)data);
kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..fc1ef736253c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -439,7 +439,7 @@ poll_again:
* substituted for %d, %x or %o in the prompt.
*/
-char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
@@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
-int vkdb_printf(const char *fmt, va_list ap)
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
int linecount;
@@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap)
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ /*
+ * This was an interactive search (using '/' at more
+ * prompt) and it has completed. Clear the flag.
+ */
+ kdb_grepping_flag = 0;
/*
* at this point the string is a full line and
* should be printed, up to the null.
@@ -691,19 +697,20 @@ kdb_printit:
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
+ cp = (char *) printk_skip_level(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
- gdbstub_msg_write(kdb_buffer, retlen);
+ gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen;
- cp = kdb_buffer;
+ len = retlen - (cp - kdb_buffer);
+ cp2 = cp;
while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
+ dbg_io_ops->write_char(*cp2);
+ cp2++;
}
}
while (c) {
- c->write(c, kdb_buffer, retlen);
+ c->write(c, cp, retlen - (cp - kdb_buffer));
touch_nmi_watchdog();
c = c->next;
}
@@ -711,7 +718,10 @@ kdb_printit:
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
- printk(KERN_INFO "%s", kdb_buffer);
+ if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+ printk("%s", kdb_buffer);
+ else
+ pr_info("%s", kdb_buffer);
}
if (KDB_STATE(PAGER)) {
@@ -794,11 +804,23 @@ kdb_printit:
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ kdb_printf("\r");
+ kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+ kdbgetenv("SEARCHPROMPT") ?: "search> ");
+ *strchrnul(kdb_grep_string, '\n') = '\0';
+ kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+ suspend_grep = 1; /* for this recursion */
} else if (buf1[0] && buf1[0] != '\n') {
/* user hit something other than enter */
suspend_grep = 1; /* for this recursion */
- kdb_printf("\nOnly 'q' or 'Q' are processed at more "
- "prompt, input ignored\n");
+ if (buf1[0] != '/')
+ kdb_printf(
+ "\nOnly 'q', 'Q' or '/' are processed at "
+ "more prompt, input ignored\n");
+ else
+ kdb_printf("\n'/' cannot be used during | "
+ "grep filtering, input ignored\n");
} else if (kdb_grepping_flag) {
/* user hit enter */
suspend_grep = 1; /* for this recursion */
@@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...)
int r;
va_start(ap, fmt);
- r = vkdb_printf(fmt, ap);
+ r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
va_end(ap);
return r;
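
The level stripping done via printk_skip_level() above can be approximated in a small userspace sketch (the "\001" marker mirrors the kernel's KERN_SOH convention, but the helper here is only an approximation, not the kernel function):

#include <stdio.h>

/* kernel messages may start with "\001<level-char>", e.g. "\0016" for INFO */
static const char *skip_level_sketch(const char *s)
{
	if (s[0] == '\001' && s[1] != '\0')
		return s + 2;
	return s;
}

int main(void)
{
	puts(skip_level_sketch("\0016hello from kdb"));	/* "hello from kdb" */
	puts(skip_level_sketch("no prefix here"));	/* unchanged */
	return 0;
}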
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 7b40c5f07dce..4121345498e0 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -50,8 +50,7 @@
static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
-#define GREP_LEN 256
-char kdb_grep_string[GREP_LEN];
+char kdb_grep_string[KDB_GREP_STRLEN];
int kdb_grepping_flag;
EXPORT_SYMBOL(kdb_grepping_flag);
int kdb_grep_leading;
@@ -870,7 +869,7 @@ static void parse_grep(const char *str)
len = strlen(cp);
if (!len)
return;
- if (len >= GREP_LEN) {
+ if (len >= KDB_GREP_STRLEN) {
kdb_printf("search string too long\n");
return;
}
@@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep;
+ int i, escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
*/
cp = (char *)cmdstr;
- kdb_grepping_flag = check_grep = 0;
if (KDB_FLAG(CMD_INTERRUPT)) {
/* Previous command was interrupted, newline must not
@@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
instruction_pointer(regs));
- kdb_dumpregs(regs);
break;
case KDB_REASON_SSTEP:
case KDB_REASON_BREAK:
@@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
kdb_nextline = 1;
KDB_STATE_CLEAR(SUPPRESS);
+ kdb_grepping_flag = 0;
+ /* ensure the old search does not leak into '/' commands */
+ kdb_grep_string[0] = '\0';
cmdbuf = cmd_cur;
*cmdbuf = '\0';
@@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
+ if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
@@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv)
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
"Buffers: %8lu kB\n",
- val.totalram, val.freeram, val.bufferram);
+ K(val.totalram), K(val.freeram), K(val.bufferram));
return 0;
}
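
The kdb_summary() fix above is just a unit conversion: sysinfo reports memory in pages, not kilobytes. A trivial userspace illustration, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))		/* pages -> kB */

int main(void)
{
	unsigned long totalram = 1048576;	/* pages, i.e. 4 GiB of RAM */

	printf("MemTotal: %8lu kB\n", K(totalram));	/* 4194304 kB */
	return 0;
}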
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index eaacd1693954..75014d7f4568 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
/* Miscellaneous functions and data areas */
extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
@@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
-extern char *kdb_getstr(char *, size_t, char *);
+extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
/* Defines for kdb_symbol_print */
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
- cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
- cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
- cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
- obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
- obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
+ gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c85277639b34..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
}
/*
- * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
* been loaded into separate segments and have been copied elsewhere.
*/
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
destination &= PAGE_MASK;
result = kimage_add_entry(image, destination | IND_DESTINATION);
- if (result == 0)
- image->destination = destination;
return result;
}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
page &= PAGE_MASK;
result = kimage_add_entry(image, page | IND_SOURCE);
- if (result == 0)
- image->destination += PAGE_SIZE;
return result;
}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > 0) {
unsigned long i;
- /* Loading another kernel to reboot into */
- if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- /* Loading another kernel to switch to if this one crashes */
- else if (flags & KEXEC_ON_CRASH) {
- /* Free any current crash dump kernel before
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
* we corrupt it.
*/
+
kimage_free(xchg(&kexec_crash_image, NULL));
result = kimage_alloc_init(&image, entry, nr_segments,
segments, flags);
crash_map_reserved_pages();
+ } else {
+ /* Loading another kernel to reboot into. */
+
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
}
if (result)
goto out;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3059bc2f022d..e16e5542bf13 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1193,7 +1193,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
if (unlikely(ret)) {
- remove_waiter(lock, &waiter);
+ if (rt_mutex_has_waiters(lock))
+ remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
diff --git a/kernel/module.c b/kernel/module.c
index 8426ad48362c..b34813f725e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3025,8 +3025,13 @@ static void do_free_init(struct rcu_head *head)
kfree(m);
}
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;
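
The noinline above is purely about keeping a reliable breakpoint target. A tiny userspace illustration (GCC/Clang attribute syntax; unrelated to the module loader itself):

#include <stdio.h>

/* survives as its own symbol, so "break do_init_sketch" works in gdb
 * even when the compiler would otherwise inline it away */
static __attribute__((noinline)) int do_init_sketch(int x)
{
	return x + 1;
}

int main(void)
{
	printf("%d\n", do_init_sketch(41));
	return 0;
}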
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
void freeze_set_ops(const struct platform_freeze_ops *ops)
{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
static void freeze_begin(void)
{
- suspend_freeze_wake = false;
+ suspend_freeze_state = FREEZE_STATE_NONE;
}
static void freeze_enter(void)
{
- cpuidle_use_deepest_state(true);
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
cpuidle_resume();
- wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
cpuidle_pause();
- cpuidle_use_deepest_state(false);
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
}
void freeze_wake(void)
{
- suspend_freeze_wake = true;
- wake_up(&suspend_freeze_wait_head);
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
}
EXPORT_SYMBOL_GPL(freeze_wake);
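
A minimal single-threaded sketch of the state machine freeze_enter() and freeze_wake() now share (illustrative only; in the kernel all of this runs under suspend_freeze_lock with interrupts off): a wakeup is only honoured once the system has actually committed to suspend-to-idle.

#include <stdio.h>

enum freeze_state { FREEZE_STATE_NONE, FREEZE_STATE_ENTER, FREEZE_STATE_WAKE };

static enum freeze_state state = FREEZE_STATE_NONE;

static void freeze_wake_sketch(void)
{
	if (state > FREEZE_STATE_NONE)
		state = FREEZE_STATE_WAKE;	/* kick the waiting CPU */
}

int main(void)
{
	freeze_wake_sketch();			/* too early: ignored */
	printf("state=%d\n", state);		/* 0 (NONE) */

	state = FREEZE_STATE_ENTER;		/* freeze_enter() has committed */
	freeze_wake_sketch();
	printf("state=%d\n", state);		/* 2 (WAKE) */
	return 0;
}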
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c06df7de0963..01cfd69c54c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args)
#ifdef CONFIG_KGDB_KDB
if (unlikely(kdb_trap_printk)) {
- r = vkdb_printf(fmt, args);
+ r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
return r;
}
#endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
}
#if defined CONFIG_COMPAT
-#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0d7bbe3095ad..0a571e9a0f1d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -326,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t)
special = t->rcu_read_unlock_special;
if (special.b.need_qs) {
rcu_preempt_qs();
+ t->rcu_read_unlock_special.b.need_qs = false;
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
* first without taking the lock so we can
* return early in the blocking case.
*/
- if (!ACCESS_ONCE(x->done))
+ if (!READ_ONCE(x->done))
return 0;
spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- return !!ACCESS_ONCE(x->done);
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13049aac05a6..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;
/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
{
do {
__preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
for_each_process_thread(g, p) {
if (rt_task(p) && task_group(p) == tg)
return 1;
@@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(current, &flags);
/*
* We need to take care of several possible races here:
@@ -541,6 +535,26 @@ again:
sched_clock_tick();
update_rq_clock(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ again:
push_dl_task(rq);
#endif
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, current, &flags);
return HRTIMER_NORESTART;
}
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index aaf1c1d5cf5d..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -105,6 +106,21 @@ static void cpuidle_idle_call(void)
rcu_idle_enter();
/*
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in interrupts (if any). In that case bypass
+ * the cpuidle governor and go straight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
+ */
+ if (idle_should_freeze()) {
+ cpuidle_enter_freeze();
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ /*
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
switch (action) {
case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
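
A minimal sketch of the clamp added above (MAX_ERRNO is the same 4095 bound the kernel uses for error pointers; the helper name is made up): the filter's 16-bit data field can exceed the valid errno range, so it is capped rather than truncated.

#include <stdio.h>

#define MAX_ERRNO 4095

static long seccomp_errno_sketch(unsigned int data)
{
	if (data > MAX_ERRNO)
		data = MAX_ERRNO;
	return -(long)data;		/* value placed in the syscall return */
}

int main(void)
{
	printf("%ld\n", seccomp_errno_sketch(1));	/* -1 (EPERM) */
	printf("%ld\n", seccomp_errno_sketch(60000));	/* -4095, not garbage */
	return 0;
}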
diff --git a/kernel/signal.c b/kernel/signal.c
index 33a52759cc0e..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
set_restore_sigmask();
return -ERESTARTNOHAND;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4b585e0fdd22..0f60b08a4f07 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -633,10 +633,14 @@ int ntp_validate_timex(struct timex *txc)
if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
return -EPERM;
- if (txc->modes & ADJ_FREQUENCY) {
- if (LONG_MIN / PPM_SCALE > txc->freq)
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
return -EINVAL;
- if (LONG_MAX / PPM_SCALE < txc->freq)
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
return -EINVAL;
}
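
A sketch of the overflow guard above (PPM_SCALE here is an assumed illustrative constant, not the kernel's exact value): before freq is multiplied by PPM_SCALE in 64-bit arithmetic, out-of-range inputs are rejected with a division-based check.

#include <limits.h>
#include <stdio.h>

#define PPM_SCALE 65536LL		/* assumed scale factor for the sketch */

static int freq_in_range(long long freq)
{
	if (LLONG_MIN / PPM_SCALE > freq)
		return 0;		/* freq * PPM_SCALE would underflow */
	if (LLONG_MAX / PPM_SCALE < freq)
		return 0;		/* freq * PPM_SCALE would overflow */
	return 1;
}

int main(void)
{
	printf("%d\n", freq_in_range(500000));		/* 1: fine */
	printf("%d\n", freq_in_range(LLONG_MAX / 2));	/* 0: rejected */
	return 0;
}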
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
}
}
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping. Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ tick_freeze_depth++;
+ if (tick_freeze_depth == num_online_cpus()) {
+ timekeeping_suspend();
+ } else {
+ tick_suspend();
+ tick_suspend_broadcast();
+ }
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping. Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_resume();
+ else
+ tick_resume();
+
+ tick_freeze_depth--;
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
/**
* tick_init - initialize the tick control
*/
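
A userspace analogue of the depth counting above (illustrative only; the real code holds tick_freeze_lock with interrupts disabled): only the last CPU to freeze suspends timekeeping, and only the first CPU to unfreeze resumes it.

#include <stdio.h>

#define NUM_ONLINE_CPUS 4			/* assumed fixed for the sketch */

static unsigned int freeze_depth;

static void tick_freeze_sketch(void)
{
	if (++freeze_depth == NUM_ONLINE_CPUS)
		puts("last CPU in: suspend timekeeping");
	else
		puts("suspend local tick only");
}

static void tick_unfreeze_sketch(void)
{
	if (freeze_depth == NUM_ONLINE_CPUS)
		puts("first CPU out: resume timekeeping");
	else
		puts("resume local tick only");
	freeze_depth--;
}

int main(void)
{
	for (int i = 0; i < NUM_ONLINE_CPUS; i++)
		tick_freeze_sketch();
	for (int i = 0; i < NUM_ONLINE_CPUS; i++)
		tick_unfreeze_sketch();
	return 0;
}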
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b124af259800..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
- * @tk: The timekeeper from which we take the update
- * @tkf: The fast timekeeper to update
- * @tbase: The time base for the fast timekeeper (mono/raw)
+ * @tkr: Timekeeping readout base from which we take the update
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* smp_wmb(); <- Ensure that the last base[1] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tk);
+ * update(tkf->base[0], tkr);
* smp_wmb(); <- Ensure that the base[0] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tk);
+ * update(tkf->base[1], tkr);
*
* The reader side does:
*
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
-static void update_fast_timekeeper(struct timekeeper *tk)
+static void update_fast_timekeeper(struct tk_read_base *tkr)
{
struct tk_read_base *base = tk_fast_mono.base;
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
raw_write_seqcount_latch(&tk_fast_mono.seq);
/* Update base[0] */
- memcpy(base, &tk->tkr, sizeof(*base));
+ memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended. It will return the same
+ * number of cycles every time until timekeeping is resumed at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+ static struct tk_read_base tkr_dummy;
+ struct tk_read_base *tkr = &tk->tkr;
+
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ cycles_at_suspend = tkr->read(tkr->clock);
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy);
+}
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall(struct timekeeper *tk)
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
- update_fast_timekeeper(tk);
+ update_fast_timekeeper(&tk->tkr);
}
/**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
* xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
hrtimers_resume();
}
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
}
timekeeping_update(tk, TK_MIRROR);
+ halt_fast_timekeeper(tk);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
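
The halt_fast_timekeeper() idea above boils down to swapping the readout hook for one that returns a frozen snapshot. A minimal userspace sketch (types and names are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

struct read_base_sketch {
	cycle_t (*read)(void);
};

static cycle_t cycles_at_suspend;

static cycle_t real_clock_read(void)
{
	return 123456789ULL;			/* stand-in for the hardware counter */
}

static cycle_t dummy_clock_read(void)
{
	return cycles_at_suspend;		/* always the value sampled at suspend */
}

int main(void)
{
	struct read_base_sketch tkr = { .read = real_clock_read };

	/* "halt": snapshot the counter, then point readers at the snapshot */
	cycles_at_suspend = tkr.read();
	tkr.read = dummy_clock_read;

	printf("frozen readout: %llu\n", (unsigned long long)tkr.read());
	return 0;
}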
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
#endif