aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c456
-rw-r--r--kernel/cgroup.c47
-rw-r--r--kernel/compat.c24
-rw-r--r--kernel/events/core.c23
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/chip.c1
-rw-r--r--kernel/kcmp.c7
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kprobes.c13
-rw-r--r--kernel/module.c5
-rw-r--r--kernel/power/power.h1
-rw-r--r--kernel/power/snapshot.c21
-rw-r--r--kernel/power/suspend.c2
-rw-r--r--kernel/power/suspend_test.c31
-rw-r--r--kernel/printk/printk.c18
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_plugin.h22
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/sched/core.c7
-rw-r--r--kernel/sched/proc.c7
-rw-r--r--kernel/seccomp.c10
-rw-r--r--kernel/time/tick-sched.c14
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/trace/ftrace.c246
-rw-r--r--kernel/trace/ring_buffer.c16
25 files changed, 587 insertions, 420 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 51793520566f..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock. This primes
- * the cache line to have the data after getting the lock.
- */
struct bsd_acct_struct {
+ struct fs_pin pin;
+ struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct list_head list;
+ struct work_struct work;
+ struct completion done;
};
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
-
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- int res;
- int act;
- u64 resume;
- u64 suspend;
-
- spin_lock(&acct_lock);
- res = acct->active;
- if (!file || time_is_before_jiffies(acct->needcheck))
+
+ if (time_is_before_jiffies(acct->needcheck))
goto out;
- spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(&file->f_path, &sbuf))
- return res;
- suspend = sbuf.f_blocks * SUSPEND;
- resume = sbuf.f_blocks * RESUME;
-
- do_div(suspend, 100);
- do_div(resume, 100);
-
- if (sbuf.f_bavail <= suspend)
- act = -1;
- else if (sbuf.f_bavail >= resume)
- act = 1;
- else
- act = 0;
-
- /*
- * If some joker switched acct->file under us we'ld better be
- * silent and _not_ touch anything.
- */
- spin_lock(&acct_lock);
- if (file != acct->file) {
- if (act)
- res = act > 0;
+ if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
- }
if (acct->active) {
- if (act < 0) {
+ u64 suspend = sbuf.f_blocks * SUSPEND;
+ do_div(suspend, 100);
+ if (sbuf.f_bavail <= suspend) {
acct->active = 0;
pr_info("Process accounting paused\n");
}
} else {
- if (act > 0) {
+ u64 resume = sbuf.f_blocks * RESUME;
+ do_div(resume, 100);
+ if (sbuf.f_bavail >= resume) {
acct->active = 1;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- res = acct->active;
out:
- spin_unlock(&acct_lock);
+ return acct->active;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *res;
+again:
+ smp_rmb();
+ rcu_read_lock();
+ res = ACCESS_ONCE(ns->bacct);
+ if (!res) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto again;
+ }
+ rcu_read_unlock();
+ mutex_lock(&res->lock);
+ if (!res->ns) {
+ mutex_unlock(&res->lock);
+ pin_put(&res->pin);
+ goto again;
+ }
return res;
}
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
- struct pid_namespace *ns)
+static void close_work(struct work_struct *work)
{
- struct file *old_acct = NULL;
- struct pid_namespace *old_ns = NULL;
-
- if (acct->file) {
- old_acct = acct->file;
- old_ns = acct->ns;
- acct->active = 0;
- acct->file = NULL;
+ struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+ struct file *file = acct->file;
+ if (file->f_op->flush)
+ file->f_op->flush(file, NULL);
+ __fput_sync(file);
+ complete(&acct->done);
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+ struct bsd_acct_struct *new)
+{
+ if (acct) {
+ struct pid_namespace *ns = acct->ns;
+ do_acct_process(acct);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ pin_remove(&acct->pin);
+ ns->bacct = new;
acct->ns = NULL;
- list_del(&acct->list);
- }
- if (file) {
- acct->file = file;
- acct->ns = ns;
- acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- acct->active = 1;
- list_add(&acct->list, &acct_list);
+ atomic_long_dec(&acct->pin.count);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
}
- if (old_acct) {
- mnt_unpin(old_acct->f_path.mnt);
- spin_unlock(&acct_lock);
- do_acct_process(acct, old_ns, old_acct);
- filp_close(old_acct, NULL);
- spin_lock(&acct_lock);
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct;
+ acct = container_of(pin, struct bsd_acct_struct, pin);
+ mutex_lock(&acct->lock);
+ if (!acct->ns) {
+ mutex_unlock(&acct->lock);
+ pin_put(pin);
+ acct = NULL;
}
+ acct_kill(acct, NULL);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
- struct vfsmount *mnt;
- struct pid_namespace *ns;
- struct bsd_acct_struct *acct = NULL;
+ struct vfsmount *mnt, *internal;
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct bsd_acct_struct *acct, *old;
+ int err;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ kfree(acct);
return PTR_ERR(file);
+ }
if (!S_ISREG(file_inode(file)->i_mode)) {
+ kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
+ kfree(acct);
filp_close(file, NULL);
return -EIO;
}
-
- ns = task_active_pid_ns(current);
- if (ns->bacct == NULL) {
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (acct == NULL) {
- filp_close(file, NULL);
- return -ENOMEM;
- }
+ internal = mnt_clone_internal(&file->f_path);
+ if (IS_ERR(internal)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return PTR_ERR(internal);
}
-
- spin_lock(&acct_lock);
- if (ns->bacct == NULL) {
- ns->bacct = acct;
- acct = NULL;
+ err = mnt_want_write(internal);
+ if (err) {
+ mntput(internal);
+ kfree(acct);
+ filp_close(file, NULL);
+ return err;
}
-
mnt = file->f_path.mnt;
- mnt_pin(mnt);
- acct_file_reopen(ns->bacct, file, ns);
- spin_unlock(&acct_lock);
-
- mntput(mnt); /* it's pinned, now give up active reference */
- kfree(acct);
-
+ file->f_path.mnt = internal;
+
+ atomic_long_set(&acct->pin.count, 1);
+ acct->pin.kill = acct_pin_kill;
+ acct->file = file;
+ acct->needcheck = jiffies;
+ acct->ns = ns;
+ mutex_init(&acct->lock);
+ mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
+ pin_insert(&acct->pin, mnt);
+
+ old = acct_get(ns);
+ if (old)
+ acct_kill(old, acct);
+ else
+ ns->bacct = acct;
+ mutex_unlock(&acct->lock);
+ mnt_drop_write(mnt);
+ mntput(mnt);
return 0;
}
+static DEFINE_MUTEX(acct_on_mutex);
+
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
@@ -264,78 +283,20 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (IS_ERR(tmp))
return PTR_ERR(tmp);
+ mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
+ mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- struct bsd_acct_struct *acct;
-
- acct = task_active_pid_ns(current)->bacct;
- if (acct == NULL)
- return 0;
-
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
+ acct_kill(acct_get(task_active_pid_ns(current)), NULL);
}
return error;
}
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off. Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt == m) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct = ns->bacct;
-
- if (acct == NULL)
- return;
-
- spin_lock(&acct_lock);
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
-
- kfree(acct);
+ acct_kill(acct_get(ns), NULL);
}
/*
@@ -450,38 +411,20 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
- acct_t ac;
- mm_segment_t fs;
- unsigned long flim;
u64 elapsed, run_time;
struct tty_struct *tty;
- const struct cred *orig_cred;
-
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct, file))
- goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset(&ac, 0, sizeof(acct_t));
+ memset(ac, 0, sizeof(acct_t));
- ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+ ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -489,9 +432,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION == 3
- ac.ac_etime = encode_float(elapsed);
+ ac->ac_etime = encode_float(elapsed);
#else
- ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
@@ -499,18 +442,58 @@ static void do_acct_process(struct bsd_acct_struct *acct,
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
- ac.ac_etime_hi = etime >> 16;
- ac.ac_etime_lo = (u16) etime;
+ ac->ac_etime_hi = etime >> 16;
+ ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
- ac.ac_btime = get_seconds() - elapsed;
+ ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+ ac->ac_ahz = AHZ;
+#endif
+
+ spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty; /* Safe as we hold the siglock */
+ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_flag = pacct->ac_flag;
+ ac->ac_mem = encode_comp_t(pacct->ac_mem);
+ ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac->ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ acct_t ac;
+ unsigned long flim;
+ const struct cred *orig_cred;
+ struct pid_namespace *ns = acct->ns;
+ struct file *file = acct->file;
+
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(acct))
+ goto out;
+
+ fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION == 2
- ac.ac_ahz = AHZ;
-#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
@@ -522,45 +505,18 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
#endif
-
- spin_lock_irq(&current->sighand->siglock);
- tty = current->signal->tty; /* Safe as we hold the siglock */
- ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
- ac.ac_flag = pacct->ac_flag;
- ac.ac_mem = encode_comp_t(pacct->ac_mem);
- ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
- ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
- ac.ac_exitcode = pacct->ac_exitcode;
- spin_unlock_irq(&current->sighand->siglock);
- ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
- ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_swaps = encode_comp_t(0);
-
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
- if (!file_start_write_trylock(file))
- goto out;
- /*
- * Kernel segment override to datasegment and write it
- * to the accounting file.
- */
- fs = get_fs();
- set_fs(KERNEL_DS);
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- file->f_op->write(file, (char *)&ac,
- sizeof(acct_t), &file->f_pos);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- set_fs(fs);
- file_end_write(file);
+ if (file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
out:
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
{
- struct file *file = NULL;
- struct bsd_acct_struct *acct;
-
- acct = ns->bacct;
- /*
- * accelerate the common fastpath:
- */
- if (!acct || !acct->file)
- return;
-
- spin_lock(&acct_lock);
- file = acct->file;
- if (unlikely(!file)) {
- spin_unlock(&acct_lock);
- return;
+ for ( ; ns; ns = ns->parent) {
+ struct bsd_acct_struct *acct = acct_get(ns);
+ if (acct) {
+ do_acct_process(acct);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
+ }
}
- get_file(file);
- spin_unlock(&acct_lock);
-
- do_acct_process(acct, ns, file);
- fput(file);
}
/**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
*
* handles process accounting for an exiting task
*/
@@ -649,6 +591,10 @@ void acct_process(void)
* alive and holds its namespace, which in turn holds
* its parent.
*/
- for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
- acct_process_in_ns(ns);
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+ if (ns->bacct)
+ break;
+ }
+ if (unlikely(ns))
+ slow_acct_process(ns);
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788cfd52..940aced4ed00 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp)
css_get(&cgrp->self);
}
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
static void cgroup_put(struct cgroup *cgrp)
{
css_put(&cgrp->self);
@@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
*/
- cgroup_get(cgrp);
+ if (!cgroup_tryget(cgrp))
+ return NULL;
kernfs_break_active_protection(kn);
mutex_lock(&cgroup_mutex);
@@ -3271,8 +3277,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_NOT_ON_DFL;
+ /*
+ * If legacy_files_on_dfl, we want to show the legacy files on the
+ * dfl hierarchy but iff the target subsystem hasn't been updated
+ * for the dfl hierarchy yet.
+ */
+ if (!cgroup_legacy_files_on_dfl ||
+ ss->dfl_cftypes != ss->legacy_cftypes) {
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ }
+
return cgroup_add_cftypes(ss, cfts);
}
@@ -4387,6 +4402,15 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
}
mutex_unlock(&cgroup_mutex);
@@ -4543,6 +4567,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cftype *base_files;
int ssid, ret;
+ /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+ */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
@@ -4820,16 +4849,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgroup_kn_unlock(kn);
- /*
- * There are two control paths which try to determine cgroup from
- * dentry without going through kernfs - cgroupstats_build() and
- * css_tryget_online_from_dir(). Those are supported by RCU
- * protecting clearing of cgrp->kn->priv backpointer, which should
- * happen after all files under it have been removed.
- */
- if (!ret)
- RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
}
@@ -5416,7 +5435,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
- * protected for this access. See cgroup_rmdir() for details.
+ * protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f442f8..ebb3c369d03d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
ret = hrtimer_nanosleep_restart(restart);
set_fs(oldfs);
- if (ret) {
+ if (ret == -ERESTART_RESTARTBLOCK) {
rmtp = restart->nanosleep.compat_rmtp;
if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
- if (ret) {
+ /*
+ * hrtimer_nanosleep() can only return 0 or
+ * -ERESTART_RESTARTBLOCK here because:
+ *
+ * - we call it with HRTIMER_MODE_REL and therefore exclude the
+ * -ERESTARTNOHAND return path.
+ *
+ * - we supply the rmtp argument from the task stack (due to
+ * the necessary compat conversion). So the update cannot
+ * fail, which excludes the -EFAULT return path as well. If
+ * it fails nevertheless we have a bigger problem and won't
+ * reach this place anymore.
+ *
+ * - if the return value is 0, we do not have to update rmtp
+ * because there is no remaining time.
+ *
+ * We check for -ERESTART_RESTARTBLOCK nevertheless if the
+ * core implementation decides to return random nonsense.
+ */
+ if (ret == -ERESTART_RESTARTBLOCK) {
struct restart_block *restart
= &current_thread_info()->restart_block;
@@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
-
return ret;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cf24b3e42ec..f9c1ed002dbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -41,6 +41,7 @@
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/mman.h>
+#include <linux/compat.h>
#include "internal.h"
@@ -3717,6 +3718,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+ case _IOC_NR(PERF_EVENT_IOC_ID):
+ /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
+ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+ cmd &= ~IOCSIZE_MASK;
+ cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+ }
+ break;
+ }
+ return perf_ioctl(file, cmd, arg);
+}
+#else
+# define perf_compat_ioctl NULL
+#endif
+
int perf_event_task_enable(void)
{
struct perf_event *event;
@@ -4222,7 +4243,7 @@ static const struct file_operations perf_fops = {
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
- .compat_ioctl = perf_ioctl,
+ .compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
};
diff --git a/kernel/fork.c b/kernel/fork.c
index 1380d8ace334..0cf9cdb6e491 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1105,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p)
* needed because this new task is not yet running and cannot
* be racing exec.
*/
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Ref-count the new filter user, and assign it. */
get_seccomp_filter(current);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2fd7b1..6223fab9a9d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -517,6 +517,7 @@ out:
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
*/
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
{
- long ret;
+ long t1, t2;
- ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+ t1 = kptr_obfuscate((long)v1, type);
+ t2 = kptr_obfuscate((long)v2, type);
- return (ret < 0) | ((ret > 0) << 1);
+ return (t1 < t2) | ((t1 > t2) << 1);
}
/* The caller must have pinned the task */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0b49a0a58102..2bee072268d9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -64,7 +64,9 @@ bool kexec_in_progress = false;
char __weak kexec_purgatory[0];
size_t __weak kexec_purgatory_size = 0;
+#ifdef CONFIG_KEXEC_FILE
static int kexec_calculate_store_digests(struct kimage *image);
+#endif
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
@@ -341,6 +343,7 @@ out_free_image:
return ret;
}
+#ifdef CONFIG_KEXEC_FILE
static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
struct fd f = fdget(fd);
@@ -612,6 +615,9 @@ out_free_image:
kfree(image);
return ret;
}
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
static int kimage_is_destination_range(struct kimage *image,
unsigned long start,
@@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
}
#endif
+#ifdef CONFIG_KEXEC_FILE
SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
@@ -1451,6 +1458,8 @@ out:
return ret;
}
+#endif /* CONFIG_KEXEC_FILE */
+
void crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void)
subsys_initcall(crash_save_vmcoreinfo_init);
+#ifdef CONFIG_KEXEC_FILE
static int __kexec_add_segment(struct kimage *image, char *buf,
unsigned long bufsz, unsigned long mem,
unsigned long memsz)
@@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
+#endif /* CONFIG_KEXEC_FILE */
/*
* Move into place and start executing a preloaded standalone
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 734e9a7d280b..3995f546d0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
- /*TODO: consider to only swap the RA after the last pre_handler fired */
+ /*
+ * To avoid deadlocks, prohibit return probing in NMI contexts,
+ * just skip the probe and increase the (inexact) 'nmissed'
+ * statistical counter, so that the user is informed that
+ * something happened:
+ */
+ if (unlikely(in_nmi())) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ /* TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
diff --git a/kernel/module.c b/kernel/module.c
index 6f69463f0066..03214bd288e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
+
+ /* we can't deallocate the module until we clear memory protection */
+ unset_module_init_ro_nx(mod);
+ unset_module_core_ro_nx(mod);
+
ddebug_cleanup:
dynamic_debug_remove(info->debug);
synchronize_sched();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dcac2537..2df883a9d3cb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
+extern const char *pm_labels[];
extern const char *pm_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4fc5c32422b3..c4b8093c80b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
}
+static bool is_nosave_page(unsigned long pfn)
+{
+ struct nosave_region *region;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ if (pfn >= region->start_pfn && pfn < region->end_pfn) {
+ pr_err("PM: %#010llx in e820 nosave region: "
+ "[mem %#010llx-%#010llx]\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (unsigned long long) region->start_pfn << PAGE_SHIFT,
+ ((unsigned long long) region->end_pfn << PAGE_SHIFT)
+ - 1);
+ return true;
+ }
+ }
+
+ return false;
+}
+
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
@@ -2015,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
do {
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
+ if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25cb0d8..18c62195660f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
#include "power.h"
-static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f524928b6aa..bd91bc177c93 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data)
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
- suspend_state_t i;
+ int i;
/* "=mem" ==> "mem" */
value++;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (!strcmp(pm_states[i], value)) {
- test_state = i;
+ for (i = 0; pm_labels[i]; i++)
+ if (!strcmp(pm_labels[i], value)) {
+ test_state_label = pm_labels[i];
return 0;
}
@@ -158,13 +158,21 @@ static int __init test_suspend(void)
struct rtc_device *rtc = NULL;
struct device *dev;
+ suspend_state_t test_state;
/* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!pm_states[test_state]) {
- printk(warn_bad_state, pm_states[test_state]);
- goto done;
+ if (!test_state_label)
+ return 0;
+
+ for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+ const char *state_label = pm_states[test_state];
+
+ if (state_label && !strcmp(test_state_label, state_label))
+ break;
+ }
+ if (test_state == PM_SUSPEND_MAX) {
+ printk(warn_bad_state, test_state_label);
+ return 0;
}
/* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +181,12 @@ static int __init test_suspend(void)
rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
- goto done;
+ return 0;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
-done:
return 0;
}
late_initcall(test_suspend);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index de1a6bb6861d..1ce770687ea8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -272,6 +272,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+ return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+ return log_buf_len;
+}
+
/* human readable text of the record */
static char *log_text(const struct printk_log *msg)
{
@@ -1653,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level,
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
- if (recursion_bug) {
+ if (unlikely(recursion_bug)) {
static const char recursion_msg[] =
"BUG: recent printk recursion!";
recursion_bug = 0;
- text_len = strlen(recursion_msg);
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg, text_len);
+ NULL, 0, recursion_msg,
+ strlen(recursion_msg));
}
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c718f75..6a86eb7bac45 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -358,7 +358,7 @@ struct rcu_data {
struct rcu_head **nocb_gp_tail;
long nocb_gp_count;
long nocb_gp_count_lazy;
- bool nocb_leader_wake; /* Is the nocb leader thread awake? */
+ bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
struct rcu_data *nocb_next_follower;
/* Next follower in wakeup chain. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411e9676..a7997e272564 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
return;
- if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+ if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior xchg orders against prior callback enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
wake_up(&rdp_leader->nocb_wq);
}
}
@@ -2253,7 +2253,7 @@ wait_again:
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
- ACCESS_ONCE(my_rdp->nocb_leader_wake));
+ !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2292,12 +2292,12 @@ wait_again:
schedule_timeout_interruptible(1);
/* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_wake = false;
- smp_mb(); /* Ensure _wake false before scan. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
if (ACCESS_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_wake = true;
+ my_rdp->nocb_leader_sleep = false;
break;
}
goto wait_again;
@@ -2307,17 +2307,17 @@ wait_again:
rcu_nocb_wait_gp(my_rdp);
/*
- * We left ->nocb_leader_wake set to reduce cache thrashing.
- * We clear it now, but recheck for new callbacks while
+ * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+ * We set it now, but recheck for new callbacks while
* traversing our follower list.
*/
- my_rdp->nocb_leader_wake = false;
- smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
if (ACCESS_ONCE(rdp->nocb_head))
- my_rdp->nocb_leader_wake = true; /* No need to wait. */
+ my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
diff --git a/kernel/resource.c b/kernel/resource.c
index da14b8d09296..60c5a3856ab7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name,
end = res->end;
BUG_ON(start >= end);
- read_lock(&resource_lock);
-
- if (first_level_children_only) {
- p = iomem_resource.child;
+ if (first_level_children_only)
sibling_only = true;
- } else
- p = &iomem_resource;
- while ((p = next_resource(p, sibling_only))) {
+ read_lock(&resource_lock);
+
+ for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575a2208..ec1a286684a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *this = this_rq();
+ *nr_waiters = atomic_read(&this->nr_iowait);
+ *load = this->cpu_load[0];
+}
+
#ifdef CONFIG_SMP
/*
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
#include "sched.h"
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
/*
* Global load-average calculations
*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 25b0043f4755..44eb005c6695 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall)
static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
return false;
@@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
static inline void seccomp_assign_mode(struct task_struct *task,
unsigned long seccomp_mode)
{
- BUG_ON(!spin_is_locked(&task->sighand->siglock));
+ assert_spin_locked(&task->sighand->siglock);
task->seccomp.mode = seccomp_mode;
/*
@@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void)
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Validate all threads being eligible for synchronization. */
caller = current;
@@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void)
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Synchronize all threads. */
caller = current;
@@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags,
unsigned long total_insns;
struct seccomp_filter *walker;
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Validate resulting filter length. */
total_insns = filter->prog->len;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee3908f..f654a8a298fa 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
};
/*
+ * Kick this CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ return;
+
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+/*
* Kick the CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
*/
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f36b02838a47..ec1791fae965 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
static inline void update_vsyscall(struct timekeeper *tk)
{
- struct timespec xt;
+ struct timespec xt, wm;
xt = timespec64_to_timespec(tk_xtime(tk));
- update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
+ wm = timespec64_to_timespec(tk->wall_to_monotonic);
+ update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
tk->tkr.cycle_last);
}
@@ -441,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
tk->ntp_error = 0;
ntp_clear();
}
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
tk_update_ktime_data(tk);
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1654b12c891a..5916a8e59e87 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -65,15 +65,21 @@
#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
#ifdef CONFIG_DYNAMIC_FTRACE
-#define INIT_REGEX_LOCK(opsname) \
- .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#define INIT_OPS_HASH(opsname) \
+ .func_hash = &opsname.local_hash, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#define ASSIGN_OPS_HASH(opsname, val) \
+ .func_hash = val, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
#else
-#define INIT_REGEX_LOCK(opsname)
+#define INIT_OPS_HASH(opsname)
+#define ASSIGN_OPS_HASH(opsname, val)
#endif
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ INIT_OPS_HASH(ftrace_list_end)
};
/* ftrace_enabled is a method to turn ftrace on or off */
@@ -140,7 +146,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
- mutex_init(&ops->regex_lock);
+ mutex_init(&ops->local_hash.regex_lock);
+ ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
@@ -899,7 +906,7 @@ static void unregister_ftrace_profiler(void)
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(ftrace_profile_ops)
+ INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
@@ -1081,11 +1088,12 @@ static const struct ftrace_hash empty_hash = {
#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
static struct ftrace_ops global_ops = {
- .func = ftrace_stub,
- .notrace_hash = EMPTY_HASH,
- .filter_hash = EMPTY_HASH,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
+ .func = ftrace_stub,
+ .local_hash.notrace_hash = EMPTY_HASH,
+ .local_hash.filter_hash = EMPTY_HASH,
+ INIT_OPS_HASH(global_ops)
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED,
};
struct ftrace_page {
@@ -1226,8 +1234,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
- free_ftrace_hash(ops->filter_hash);
- free_ftrace_hash(ops->notrace_hash);
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
}
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
@@ -1288,9 +1296,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
}
static void
-ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
-ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
@@ -1342,13 +1350,13 @@ update:
* Remove the current set, update the hash and add
* them back.
*/
- ftrace_hash_rec_disable(ops, enable);
+ ftrace_hash_rec_disable_modify(ops, enable);
old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
free_ftrace_hash_rcu(old_hash);
- ftrace_hash_rec_enable(ops, enable);
+ ftrace_hash_rec_enable_modify(ops, enable);
return 0;
}
@@ -1382,8 +1390,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
- notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
+ filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
@@ -1503,25 +1511,38 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
static void ftrace_remove_tramp(struct ftrace_ops *ops,
struct dyn_ftrace *rec)
{
- struct ftrace_func_entry *entry;
-
- entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
- if (!entry)
+ /* If TRAMP is not set, no ops should have a trampoline for this */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
return;
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ if ((!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) ||
+ ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+ return;
/*
* The tramp_hash entry will be removed at time
* of update.
*/
ops->nr_trampolines--;
- rec->flags &= ~FTRACE_FL_TRAMP;
}
-static void ftrace_clear_tramps(struct dyn_ftrace *rec)
+static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops)
{
struct ftrace_ops *op;
+ /* If TRAMP is not set, no ops should have a trampoline for this */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ return;
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This function is called to clear other tramps
+ * not the one that is being updated.
+ */
+ if (op == ops)
+ continue;
if (op->nr_trampolines)
ftrace_remove_tramp(op, rec);
} while_for_each_ftrace_op(op);
@@ -1554,14 +1575,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* gets inversed.
*/
if (filter_hash) {
- hash = ops->filter_hash;
- other_hash = ops->notrace_hash;
+ hash = ops->func_hash->filter_hash;
+ other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
all = 1;
} else {
inc = !inc;
- hash = ops->notrace_hash;
- other_hash = ops->filter_hash;
+ hash = ops->func_hash->notrace_hash;
+ other_hash = ops->func_hash->filter_hash;
/*
* If the notrace hash has no items,
* then there's nothing to do.
@@ -1622,13 +1643,10 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
/*
* If we are adding another function callback
* to this function, and the previous had a
- * trampoline used, then we need to go back to
- * the default trampoline.
+ * custom trampoline in use, then we need to go
+ * back to the default trampoline.
*/
- rec->flags &= ~FTRACE_FL_TRAMP;
-
- /* remove trampolines from any ops for this rec */
- ftrace_clear_tramps(rec);
+ ftrace_clear_tramps(rec, ops);
}
/*
@@ -1682,6 +1700,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
+ int filter_hash, int inc)
+{
+ struct ftrace_ops *op;
+
+ __ftrace_hash_rec_update(ops, filter_hash, inc);
+
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ /*
+ * If the ops shares the global_ops hash, then we need to update
+ * all ops that are enabled and use this hash.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* Already done */
+ if (op == ops)
+ continue;
+ if (op->func_hash == &global_ops.local_hash)
+ __ftrace_hash_rec_update(op, filter_hash, inc);
+ } while_for_each_ftrace_op(op);
+}
+
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+}
+
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1896,8 +1949,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
if (rec->flags & FTRACE_FL_TRAMP) {
ops = ftrace_find_tramp_ops_new(rec);
if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
- pr_warning("Bad trampoline accounting at: %p (%pS)\n",
- (void *)rec->ip, (void *)rec->ip);
+ pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
+ (void *)rec->ip, (void *)rec->ip, rec->flags);
/* Ftrace is shutting down, return anything */
return (unsigned long)FTRACE_ADDR;
}
@@ -1964,7 +2017,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
@@ -2227,7 +2280,10 @@ static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
} while_for_each_ftrace_rec();
/* The number of recs in the hash must match nr_trampolines */
- FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
+ if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines))
+ pr_warn("count=%ld trampolines=%d\n",
+ ops->tramp_hash->count,
+ ops->nr_trampolines);
return 0;
}
@@ -2436,8 +2492,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
* Filter_hash being empty will default to trace module.
* But notrace hash requires a test of individual module functions.
*/
- return ftrace_hash_empty(ops->filter_hash) &&
- ftrace_hash_empty(ops->notrace_hash);
+ return ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash);
}
/*
@@ -2459,12 +2515,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 0;
/* The function must be in the filter */
- if (!ftrace_hash_empty(ops->filter_hash) &&
- !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
return 0;
return 1;
@@ -2785,10 +2841,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
} else {
rec = &iter->pg->records[iter->idx++];
if (((iter->flags & FTRACE_ITER_FILTER) &&
- !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
+ !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
- !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
+ !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
!(rec->flags & FTRACE_FL_ENABLED))) {
@@ -2837,9 +2893,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* functions are enabled.
*/
if ((iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->filter_hash)) ||
+ ftrace_hash_empty(ops->func_hash->filter_hash)) ||
(iter->flags & FTRACE_ITER_NOTRACE &&
- ftrace_hash_empty(ops->notrace_hash))) {
+ ftrace_hash_empty(ops->func_hash->notrace_hash))) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -3001,12 +3057,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (flag & FTRACE_ITER_NOTRACE)
- hash = ops->notrace_hash;
+ hash = ops->func_hash->notrace_hash;
else
- hash = ops->filter_hash;
+ hash = ops->func_hash->filter_hash;
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
@@ -3041,7 +3097,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
file->private_data = iter;
out_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
return ret;
}
@@ -3279,7 +3335,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
.flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(trace_probe_ops)
+ INIT_OPS_HASH(trace_probe_ops)
};
static int ftrace_probe_registered;
@@ -3342,7 +3398,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_func_probe *entry;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3359,7 +3415,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
if (WARN_ON(not))
return -EINVAL;
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
@@ -3428,7 +3484,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
out_unlock:
mutex_unlock(&ftrace_lock);
out:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
return count;
@@ -3446,7 +3502,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
@@ -3468,7 +3524,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return;
}
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
@@ -3521,7 +3577,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_unlock(&ftrace_lock);
out_unlock:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
}
@@ -3717,12 +3773,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
if (unlikely(ftrace_disabled))
return -ENODEV;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (enable)
- orig_hash = &ops->filter_hash;
+ orig_hash = &ops->func_hash->filter_hash;
else
- orig_hash = &ops->notrace_hash;
+ orig_hash = &ops->func_hash->notrace_hash;
if (reset)
hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
@@ -3752,7 +3808,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_unlock(&ftrace_lock);
out_regex_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
free_ftrace_hash(hash);
return ret;
@@ -3975,15 +4031,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
trace_parser_put(parser);
- mutex_lock(&iter->ops->regex_lock);
+ mutex_lock(&iter->ops->func_hash->regex_lock);
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
if (filter_hash)
- orig_hash = &iter->ops->filter_hash;
+ orig_hash = &iter->ops->func_hash->filter_hash;
else
- orig_hash = &iter->ops->notrace_hash;
+ orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(iter->ops, filter_hash,
@@ -3994,7 +4050,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_unlock(&ftrace_lock);
}
- mutex_unlock(&iter->ops->regex_lock);
+ mutex_unlock(&iter->ops->func_hash->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);
@@ -4611,7 +4667,6 @@ void __init ftrace_init(void)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
};
static int __init ftrace_nodyn_init(void)
@@ -4713,7 +4768,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops control_ops = {
.func = ftrace_ops_control_func,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(control_ops)
+ INIT_OPS_HASH(control_ops)
};
static inline void
@@ -5145,6 +5200,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static struct ftrace_ops graph_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+ .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+#endif
+ ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
+
static int ftrace_graph_active;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -5307,12 +5373,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
*/
static void update_function_graph_func(void)
{
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list == &global_ops &&
- global_ops.next == &ftrace_list_end))
- ftrace_graph_entry = __ftrace_graph_entry;
- else
+ struct ftrace_ops *op;
+ bool do_test = false;
+
+ /*
+ * The graph and global ops share the same set of functions
+ * to test. If any other ops is on the list, then
+ * the graph tracing needs to test if it's the function
+ * it should call.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op != &global_ops && op != &graph_ops &&
+ op != &ftrace_list_end) {
+ do_test = true;
+ /* in double loop, break out with goto */
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+ out:
+ if (do_test)
ftrace_graph_entry = ftrace_graph_entry_test;
+ else
+ ftrace_graph_entry = __ftrace_graph_entry;
}
static struct notifier_block ftrace_suspend_notifier = {
@@ -5353,16 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
- /* Function graph doesn't use the .func field of global_ops */
- global_ops.flags |= FTRACE_OPS_FL_STUB;
-
-#ifdef CONFIG_DYNAMIC_FTRACE
- /* Optimize function graph calling (if implemented by arch) */
- if (FTRACE_GRAPH_TRAMP_ADDR != 0)
- global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
-#endif
-
- ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -5380,12 +5453,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
- global_ops.flags &= ~FTRACE_OPS_FL_STUB;
-#ifdef CONFIG_DYNAMIC_FTRACE
- if (FTRACE_GRAPH_TRAMP_ADDR != 0)
- global_ops.trampoline = 0;
-#endif
+ ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index afb04b9b818a..b38fb2b9e237 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- work->waiters_pending = true;
poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ /*
+ * There's a tight race between setting the waiters_pending and
+ * checking if the ring buffer is empty. Once the waiters_pending bit
+ * is set, the next event will wake the task up, but we can get stuck
+ * if there's only a single event in the buffer.
+ *
+ * FIXME: Ideally, we need a memory barrier on the writer side as well,
+ * but adding a memory barrier to all events will cause too much of a
+ * performance hit in the fast path. We only need a memory barrier when
+ * the buffer goes from empty to having content. But as this race is
+ * extremely small, and it's not a problem if another event comes in, we
+ * will fix it later.
+ */
+ smp_mb();
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))