aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c11
-rw-r--r--kernel/bounds.c7
-rw-r--r--kernel/bpf/bloom_filter.c2
-rw-r--r--kernel/bpf/cgroup_iter.c2
-rw-r--r--kernel/bpf/core.c8
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/task_iter.c10
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/cgroup/cgroup-internal.h1
-rw-r--r--kernel/cgroup/cgroup.c203
-rw-r--r--kernel/debug/debug_core.c12
-rw-r--r--kernel/delayacct.c13
-rw-r--r--kernel/dma/mapping.c10
-rw-r--r--kernel/entry/common.c5
-rw-r--r--kernel/events/core.c3
-rw-r--r--kernel/events/uprobes.c39
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fail_function.c26
-rw-r--r--kernel/fork.c80
-rw-r--r--kernel/gcov/gcc_4_7.c18
-rw-r--r--kernel/irq/irqdesc.c24
-rw-r--r--kernel/kcov.c7
-rw-r--r--kernel/kcsan/selftest.c4
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kexec_core.c36
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/kexec_internal.h15
-rw-r--r--kernel/ksysfs.c7
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/test-ww_mutex.c4
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/profile.c32
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c175
-rw-r--r--kernel/sched/psi.c280
-rw-r--r--kernel/sched/sched.h1
-rw-r--r--kernel/sched/stats.h6
-rw-r--r--kernel/smpboot.c15
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/task_work.c16
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/trace/blktrace.c82
-rw-r--r--kernel/trace/ftrace.c5
-rw-r--r--kernel/trace/ring_buffer.c6
-rw-r--r--kernel/trace/trace_eprobe.c60
-rw-r--r--kernel/trace/trace_events_synth.c23
-rw-r--r--kernel/trace/trace_kprobe.c60
-rw-r--r--kernel/trace/trace_probe_kernel.h115
-rw-r--r--kernel/utsname_sysctl.c7
53 files changed, 1044 insertions, 453 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 318789c728d3..d754e0be1176 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# Don't instrument error handlers
diff --git a/kernel/acct.c b/kernel/acct.c
index 13706356ec54..62200d799b9b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..b529182e8b04 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b9ea539a5561..48ee750849f2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
attr->value_size / sizeof(u32);
if (!(attr->map_flags & BPF_F_ZERO_SEED))
- bloom->hash_seed = get_random_int();
+ bloom->hash_seed = get_random_u32();
return &bloom->map;
}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 0d200a993489..9fcf09f2ef00 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -196,7 +196,7 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
return -EINVAL;
if (fd)
- cgrp = cgroup_get_from_fd(fd);
+ cgrp = cgroup_v1v2_get_from_fd(fd);
else if (id)
cgrp = cgroup_get_from_id(id);
else /* walk the entire hierarchy by default. */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 711fd293b6de..25a54e04560e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -2007,7 +2007,7 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ed3f8a53603b..f39ee3e05589 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 67e03e1833ba..c2a2182ce570 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -445,8 +445,8 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
@@ -544,10 +544,10 @@ again:
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_task->mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
@@ -561,7 +561,7 @@ again:
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6f6d2d511c06..014ee0953dbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13350,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 2c7ecca226be..fd4020835ec6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 764bdd5fd8d1..2319946715e0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1392,6 +1392,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
+/*
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
+ */
static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
@@ -1403,6 +1406,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -1414,6 +1418,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
}
}
+ BUG_ON(!res_cgroup);
return res_cgroup;
}
@@ -1436,23 +1441,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_unlock();
- BUG_ON(!res);
return res;
}
+/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
- struct cgroup *res = NULL;
-
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- res = __cset_cgroup_from_root(cset, root);
-
- BUG_ON(!res);
- return res;
+ return __cset_cgroup_from_root(cset, root);
}
/*
@@ -3698,27 +3716,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@@ -3738,7 +3756,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+ psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@@ -3755,21 +3773,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3872,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
bool cgroup_psi_enabled(void)
{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -5175,6 +5261,7 @@ static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5182,6 +5269,7 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "memory.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5189,11 +5277,27 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "cpu.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -6105,9 +6209,7 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!cgrp)
return ERR_PTR(-ENOENT);
- spin_lock_irq(&css_set_lock);
- root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
+ root_cgrp = current_cgns_cgroup_dfl();
if (!cgroup_is_descendant(cgrp, root_cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-ENOENT);
@@ -6208,16 +6310,42 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
return cgrp;
}
@@ -6686,10 +6814,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
struct cgroup *cgrp = ERR_PTR(-ENOENT);
struct cgroup *root_cgrp;
- spin_lock_irq(&css_set_lock);
- root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+ root_cgrp = current_cgns_cgroup_dfl();
kn = kernfs_walk_and_get(root_cgrp->kn, path);
- spin_unlock_irq(&css_set_lock);
if (!kn)
goto out;
@@ -6714,15 +6840,15 @@ out:
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
struct file *f;
@@ -6731,10 +6857,29 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f);
+ cgrp = cgroup_v1v2_get_from_file(f);
fput(f);
return cgrp;
}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+ return cgrp;
+}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
static u64 power_of_ten(int power)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7beceb447211..d5e9ccde3ab8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -50,7 +50,6 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
#include <linux/security.h>
@@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 164ed9ef77a3..e39cb696cfbd 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -214,13 +214,22 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 27f272381cf2..33437d620644 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,6 +10,7 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ kmsan_handle_dma(page, offset, size, dir);
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
@@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- if (ents > 0)
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
- else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO && ents != -EREMOTEIO))
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
return -EIO;
+ }
return ents;
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 063068a9ea9b..846add8394c4 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
user_exit_irqoff();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
}
@@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
lockdep_hardirqs_off(CALLER_ADDR0);
ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 43b0df997d13..aefc1e08e015 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10270,8 +10270,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2eaa327f8158..d9e357b7e17c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,7 +19,7 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
@@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
- GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
+ folio_get(new_folio);
page_add_new_anon_rmap(new_page, vma, addr);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, vma, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -552,7 +555,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b8e7c94a3c0..35e0a31a0315 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -60,6 +60,7 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
@@ -183,6 +184,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -466,6 +471,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -759,6 +765,7 @@ void __noreturn do_exit(long code)
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 60dc825ecc2b..a7ccd2930c5f 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
-out_free:
kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 947eb1a6399a..08969f5aa38d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,13 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -97,7 +97,6 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
-#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*/
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
dup_anon_vma_name(orig, new);
}
return new;
@@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ retval = mas_expected_entries(&mas, oldmm->map_count);
+ if (retval)
+ goto out;
+
+ mas_for_each(&old_mas, mpnt, ULONG_MAX) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
-
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
+ hugetlb_dup_vma_private(tmp);
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ /* Link the vma into the MT */
+ mas.index = tmp->vm_start;
+ mas.last = tmp->vm_end - 1;
+ mas_store(&mas, tmp);
+ if (mas_is_err(&mas))
+ goto fail_nomem_mas_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);
if (retval)
- goto out;
+ goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ mas_destroy(&mas);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -714,6 +708,9 @@ out:
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
+
+fail_nomem_mas_store:
+ unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -721,7 +718,7 @@ fail_nomem_policy:
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -1026,6 +1023,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->worker_private = NULL;
kcov_task_init(tsk);
+ kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
@@ -1109,9 +1107,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
@@ -1152,6 +1149,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@@ -1194,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -1285,13 +1284,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
- &old_exe_file->f_path))
+ &old_exe_file->f_path)) {
ret = -EBUSY;
+ break;
+ }
}
mmap_read_unlock(mm);
fput(old_exe_file);
@@ -1566,9 +1568,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
@@ -2693,6 +2692,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 460c12b7dfea..7971e989e425 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -30,6 +30,13 @@
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -383,12 +390,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -402,7 +415,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 5db0230aa6b5..a91f9001103c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
+ */
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
+
/**
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
* to a domain.
diff --git a/kernel/kcov.c b/kernel/kcov.c
index e19c84b02452..e5cd09fd8a05 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
@@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
}
static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 75712959c84e..00cdf8fa5693 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -26,7 +26,7 @@
static bool __init test_requires(void)
{
/* random should be initialized for the below tests */
- return prandom_u32() + prandom_u32() != 0;
+ return get_random_u32() + get_random_u32() != 0;
}
/*
@@ -46,7 +46,7 @@ static bool __init test_encode_decode(void)
unsigned long addr;
size_t verif_size;
- prandom_bytes(&addr, sizeof(addr));
+ get_random_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b5e40f069768..cb8e6e6f983c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/*
* Because we write directly to the reserved memory region when loading
- * crash kernels we need a mutex here to prevent multiple crash kernels
- * from attempting to load simultaneously, and to prevent a crash kernel
- * from loading over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (flags & KEXEC_ON_CRASH) {
@@ -165,7 +162,7 @@ out:
kimage_free(image);
out_unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index acd029b307e4..ca2743f9c634 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -46,7 +46,7 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
+ kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
@@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
- kunmap(page);
+ kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
@@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init);
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
@@ -1004,14 +1004,17 @@ void crash_kexec(struct pt_regs *regs)
}
}
-size_t crash_get_memory_size(void)
+ssize_t crash_get_memory_size(void)
{
- size_t size = 0;
+ ssize_t size = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
- mutex_lock(&kexec_mutex);
if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
+
+ kexec_unlock();
return size;
}
@@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size)
unsigned long old_size;
struct resource *ram_res;
- mutex_lock(&kexec_mutex);
+ if (!kexec_trylock())
+ return -EBUSY;
if (kexec_crash_image) {
ret = -ENOENT;
@@ -1060,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size)
insert_resource(&iomem_resource, ram_res);
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1132,7 +1136,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1208,6 +1212,6 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 1d546dc97c50..45637511e0de 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
dest_image = &kexec_image;
@@ -411,7 +411,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..74da1409cd14 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index b1292a57c2a5..65dba9076f31 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sprintf(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 76166df011a4..781249098cb6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -112,7 +112,7 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
/* skip kernel threads for now */
@@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index d51cabf28f38..ea925731fa40 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
+KMSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 353004155d65..43efb2a04160 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -399,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = prandom_u32_max(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks);
int err;
do {
diff --git a/kernel/pid.c b/kernel/pid.c
index 2fc0a16ec77b..3fbc5e46b721 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
+EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
diff --git a/kernel/profile.c b/kernel/profile.c
index 7ea01ba30e75..8a77769bc4b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,43 +59,39 @@ int profile_setup(char *str)
static const char schedstr[] = "schedule";
static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
+ const char *select = NULL;
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
- if (str[strlen(sleepstr)] == ',')
- str += strlen(sleepstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel sleep profiling enabled (shift: %u)\n",
- prof_shift);
+ select = sleepstr;
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- if (str[strlen(schedstr)] == ',')
- str += strlen(schedstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel schedule profiling enabled (shift: %u)\n",
- prof_shift);
+ select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
- if (str[strlen(kvmstr)] == ',')
- str += strlen(kvmstr) + 1;
- if (get_option(&str, &par))
- prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
- pr_info("kernel KVM profiling enabled (shift: %u)\n",
- prof_shift);
+ select = kvmstr;
} else if (get_option(&str, &par)) {
prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
+
+ if (select) {
+ if (str[strlen(select)] == ',')
+ str += strlen(select) + 1;
+ if (get_option(&str, &par))
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel %s profiling enabled (shift: %u)\n",
+ select, prof_shift);
+ }
+
return 1;
}
__setup("profile=", profile_setup);
diff --git a/kernel/relay.c b/kernel/relay.c
index 6a611e779e95..d7edc934c56d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- const size_t pa_size = n_pages * sizeof(struct page *);
- if (pa_size > PAGE_SIZE)
- return vzalloc(pa_size);
- return kzalloc(pa_size, GFP_KERNEL);
+ return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f4d02201f424..5800b0623ff3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -4389,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -4405,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
@@ -5159,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 667876da8382..1637b65ba07a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5ffec4370602..e4a0b8bd941c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -40,6 +40,7 @@
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
@@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
@@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = xchg_page_access_time(page, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(page);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7f6030091aee..ee2ecc081422 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
+ group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
return;
}
@@ -211,7 +213,7 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
- u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
+ u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we assess the aggregate resource states this CPU's
- * tasks have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- *
- * Then we update the task counts according to the state
+ * First we update the task counts according to the state
* change requested through the @clear and @set bits.
+ *
+ * Then if the cgroup PSI stats accounting enabled, we
+ * assess the aggregate resource states this CPU's tasks
+ * have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
- record_times(groupc, now);
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], groupc->tasks[4],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
}
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
- /* Calculate state mask representing active states */
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ write_seqcount_end(&groupc->seq);
+ return;
+ }
+
for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s))
+ if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state.
*/
- if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
+ record_times(groupc, now);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
- if (*iter == &psi_system)
- return NULL;
-
#ifdef CONFIG_CGROUPS
- if (static_branch_likely(&psi_cgroups_enabled)) {
- struct cgroup *cgroup = NULL;
-
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else
- cgroup = cgroup_parent(*iter);
-
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
- }
- }
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
#endif
- *iter = &psi_system;
return &psi_system;
}
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
u64 now;
if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
psi_flags_change(task, clear, set);
now = cpu_clock(cpu);
- /*
- * Periodic aggregation shuts off if there is a period of no
- * task changes, so we wake it back up if necessary. However,
- * don't do this if the task change is the aggregation worker
- * itself going to sleep, or we'll ping-pong forever.
- */
- if (unlikely((clear & TSK_RUNNING) &&
- (task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_avgs_work))
- wake_clock = false;
- while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, now, wake_clock);
+ group = task_psi_group(task);
+ do {
+ psi_group_change(group, cpu, clear, set, now, true);
+ } while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- void *iter;
u64 now = cpu_clock(cpu);
if (next->pid) {
- bool identical_state;
-
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When switching between tasks that have an identical
- * runtime state, the cgroup that contains both tasks
- * we reach the first common ancestor. Iterate @next's
- * ancestors only until we encounter @prev's ONCPU.
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
*/
- identical_state = prev->psi_flags == next->psi_flags;
- iter = NULL;
- while ((group = iterate_groups(next, &iter))) {
- if (identical_state &&
- per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ group = task_psi_group(next);
+ do {
+ if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+ PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
- }
+ } while ((group = group->parent));
}
if (prev->pid) {
int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
/*
* When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
}
psi_flags_change(prev, clear, set);
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, clear, set, now, true);
+ group = task_psi_group(prev);
+ do {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ } while ((group = group->parent));
/*
- * TSK_ONCPU is handled up to the common ancestor. If we're tasked
- * with dequeuing too, finish that for the rest of the hierarchy.
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
*/
- if (sleep) {
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = iterate_groups(prev, &iter))
- psi_group_change(group, cpu, clear, set, now, true);
+ for (; group; group = group->parent)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ struct psi_group_cpu *groupc;
+ u64 now;
+
+ if (!task->pid)
+ return;
+
+ now = cpu_clock(cpu);
+
+ group = task_psi_group(task);
+ do {
+ if (!group->enabled)
+ continue;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (group->poll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_poll_work(group, 1);
+ } while ((group = group->parent));
+}
+#endif
+
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
return -ENOMEM;
}
group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return;
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
struct rq_flags rf;
struct rq *rq;
- if (static_branch_likely(&psi_disabled)) {
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After we disable psi_group->enabled, we don't actually
+ * stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop test_state() loop, record_times()
+ * and averaging worker, see psi_group_change() for details.
+ *
+ * When disable cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and percpu psi_group_cpu
+ * would see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enable cgroup PSI, this function use psi_group_change()
+ * to get correct state mask from test_state() loop on tasks[],
+ * and restart groupc->state_start from now, use .clear = .set = 0
+ * here since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ u64 now;
+
+ rq_lock_irq(rq, &rf);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ rq_unlock_irq(rq, &rf);
+ }
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+ bool only_full = false;
int full;
u64 now;
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1fc198be1ffd..1644242ecd11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2446,6 +2446,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baa839c1ba96..84a188913cc9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}
#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index b9f54544e749..2c7396da470c 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
/* The outgoing CPU will normally get done quite quickly. */
if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
- goto update_state;
+ goto update_state_early;
udelay(5);
/* But if the outgoing CPU dawdles, wait increasingly long times. */
@@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds)
break;
sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
}
-update_state:
+update_state_early:
oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+update_state:
if (oldstate == CPU_DEAD) {
/* Outgoing CPU died normally, update state. */
smp_mb(); /* atomic_read() before update. */
atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
} else {
/* Outgoing CPU still hasn't died, set state accordingly. */
- if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, CPU_BROKEN) != oldstate)
+ if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ &oldstate, CPU_BROKEN))
goto update_state;
ret = false;
}
@@ -475,14 +476,14 @@ bool cpu_report_death(void)
int newstate;
int cpu = smp_processor_id();
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
do {
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
if (oldstate != CPU_BROKEN)
newstate = CPU_DEAD;
else
newstate = CPU_DEAD_FROZEN;
- } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- oldstate, newstate) != oldstate);
+ } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ &oldstate, newstate));
return newstate == CPU_DEAD;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82ab288758f5..188c305aeb8b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1643,6 +1643,14 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
},
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index dff75bcde151..065e1ef8fc8d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
+ head = READ_ONCE(task->task_works);
do {
- head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH;
work->next = head;
- } while (cmpxchg(&task->task_works, head, work) != head);
+ } while (!try_cmpxchg(&task->task_works, &head, work));
switch (notify) {
case TWA_NONE:
@@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task,
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = READ_ONCE(*pprev))) {
- if (!match(work, data))
+ work = READ_ONCE(*pprev);
+ while (work) {
+ if (!match(work, data)) {
pprev = &work->next;
- else if (cmpxchg(pprev, work, work->next) == work)
+ work = READ_ONCE(*pprev);
+ } else if (try_cmpxchg(pprev, &work, work->next))
break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -151,16 +153,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ work = READ_ONCE(task->task_works);
do {
head = NULL;
- work = READ_ONCE(task->task_works);
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
- } while (cmpxchg(&task->task_works, work, head) != work);
+ } while (!try_cmpxchg(&task->task_works, &work, head));
if (!work)
break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cee5da1e54c4..8058bec87ace 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = prandom_u32() % nr_cpu_ids;
+ cpu = prandom_u32_max(nr_cpu_ids);
cpu = cpumask_next(cpu - 1, cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_online_mask);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7f5eb295fe19..a995ea1ef849 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -346,8 +346,40 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
+static int blk_trace_start(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_setup &&
+ bt->trace_state != Blktrace_stopped)
+ return -EINVAL;
+
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ trace_note_time(bt);
+
+ return 0;
+}
+
+static int blk_trace_stop(struct blk_trace *bt)
+{
+ if (bt->trace_state != Blktrace_running)
+ return -EINVAL;
+
+ bt->trace_state = Blktrace_stopped;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+
+ return 0;
+}
+
static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
+ blk_trace_stop(bt);
synchronize_rcu();
blk_trace_free(q, bt);
put_probe_ref();
@@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q)
if (!bt)
return -EINVAL;
- if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(q, bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
- int ret;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
@@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
if (bt == NULL)
return -EINVAL;
- /*
- * For starting a trace, we can transition from a setup or stopped
- * trace. For stopping a trace, the state must be running
- */
- ret = -EINVAL;
- if (start) {
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped) {
- blktrace_seq++;
- smp_mb();
- bt->trace_state = Blktrace_running;
- raw_spin_lock_irq(&running_trace_lock);
- list_add(&bt->running_list, &running_trace_list);
- raw_spin_unlock_irq(&running_trace_lock);
-
- trace_note_time(bt);
- ret = 0;
- }
- } else {
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- ret = 0;
- }
- }
-
- return ret;
+ if (start)
+ return blk_trace_start(bt);
+ else
+ return blk_trace_stop(bt);
}
int blk_trace_startstop(struct request_queue *q, int start)
@@ -772,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
void blk_trace_shutdown(struct request_queue *q)
{
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->debugfs_mutex))) {
- __blk_trace_startstop(q, 0);
+ lockdep_is_held(&q->debugfs_mutex)))
__blk_trace_remove(q);
- }
}
#ifdef CONFIG_BLK_CGROUP
@@ -1614,13 +1616,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- raw_spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- raw_spin_unlock_irq(&running_trace_lock);
- relay_flush(bt->rchan);
- }
+ blk_trace_stop(bt);
put_probe_ref();
synchronize_rcu();
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02e18c436439..fbf2543111c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2028,7 +2028,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p)
{
char ins[MCOUNT_INSN_SIZE];
- int i;
if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) {
printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
@@ -2036,9 +2035,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p)
}
printk(KERN_CONT "%s", fmt);
-
- for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
+ pr_cont("%*phC", MCOUNT_INSN_SIZE, ins);
}
enum ftrace_bug_type ftrace_bug_type;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c3f354cfc5ba..199759c73519 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -885,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
- * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -5305,7 +5305,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
- * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
@@ -5375,7 +5375,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
- * rind_buffer_empty - is the ring buffer empty?
+ * ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index c08bde9871ec..5dd0617e5df6 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -16,6 +16,7 @@
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -456,29 +457,14 @@ NOKPROBE_SYMBOL(process_fetch_insn)
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -488,21 +474,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -512,29 +484,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 5e8c07aef071..e310052dc83c 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -17,6 +17,8 @@
/* for gfp flag names */
#include <linux/trace_events.h>
#include <trace/events/mmflags.h>
+#include "trace_probe.h"
+#include "trace_probe_kernel.h"
#include "trace_synth.h"
@@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
{
unsigned int len = 0;
char *str_field;
+ int ret;
if (is_dynamic) {
u32 data_offset;
@@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry,
data_offset += event->n_u64 * sizeof(u64);
data_offset += data_size;
- str_field = (char *)entry + data_offset;
-
- len = strlen(str_val) + 1;
- strscpy(str_field, str_val, len);
+ len = kern_fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
+ ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+
(*n_u64)++;
} else {
str_field = (char *)&entry->fields[*n_u64];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)str_val < TASK_SIZE)
+ ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+ else
+#endif
+ ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+
+ if (ret < 0)
+ strcpy(str_field, FAULT_STRING);
+
(*n_u64) += STR_VAR_LEN_MAX / sizeof(u64);
}
@@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
- len = strlen(str_val) + 1;
+ len = kern_fetch_store_strlen((unsigned long)str_val);
fields_size += len;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 23f7f0ec4f4c..5a75b039e586 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -20,6 +20,7 @@
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
+#include "trace_probe_kernel.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen_user(unsigned long addr)
{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ return kern_fetch_store_strlen_user(addr);
}
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- int ret, len = 0;
- u8 c;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if (addr < TASK_SIZE)
- return fetch_store_strlen_user(addr);
-#endif
-
- do {
- ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
-
- return (ret < 0) ? ret : len;
+ return kern_fetch_store_strlen(addr);
}
/*
@@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr)
static nokprobe_inline int
fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string_user(addr, dest, base);
}
/*
@@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
static nokprobe_inline int
fetch_store_string(unsigned long addr, void *dest, void *base)
{
- int maxlen = get_loc_len(*(u32 *)dest);
- void *__dest;
- long ret;
-
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)addr < TASK_SIZE)
- return fetch_store_string_user(addr, dest, base);
-#endif
-
- if (unlikely(!maxlen))
- return -ENOMEM;
-
- __dest = get_loc_data(dest, base);
-
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
- if (ret >= 0)
- *(u32 *)dest = make_data_loc(ret, __dest - base);
-
- return ret;
+ return kern_fetch_store_string(addr, dest, base);
}
static nokprobe_inline int
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
new file mode 100644
index 000000000000..77dbd9ff9782
--- /dev/null
+++ b/kernel/trace/trace_probe_kernel.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_PROBE_KERNEL_H_
+#define __TRACE_PROBE_KERNEL_H_
+
+#define FAULT_STRING "(fault)"
+
+/*
+ * This depends on trace_probe.h, but can not include it due to
+ * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c.
+ * Which means that any other user must include trace_probe.h before including
+ * this file.
+ */
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int ret;
+
+ ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+ /*
+ * strnlen_user_nofault returns zero on fault, insert the
+ * FAULT_STRING when that occurs.
+ */
+ if (ret <= 0)
+ return strlen(FAULT_STRING) + 1;
+ return ret;
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+kern_fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return kern_fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ /* For faults, return enough to hold the FAULT_STRING */
+ return (ret < 0) ? strlen(FAULT_STRING) + 1 : len;
+}
+
+static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len)
+{
+ if (ret >= 0) {
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+ } else {
+ strscpy(__dest, FAULT_STRING, len);
+ ret = strlen(__dest) + 1;
+ }
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+kern_fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return kern_fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ set_data_loc(ret, dest, __dest, base, maxlen);
+
+ return ret;
+}
+
+#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index de16bcf14b03..064072c16e3d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -76,6 +76,13 @@ static DEFINE_CTL_TABLE_POLL(domainname_poll);
static struct ctl_table uts_kern_table[] = {
{
+ .procname = "arch",
+ .data = init_uts_ns.name.machine,
+ .maxlen = sizeof(init_uts_ns.name.machine),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
.procname = "ostype",
.data = init_uts_ns.name.sysname,
.maxlen = sizeof(init_uts_ns.name.sysname),