aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-05-01 17:02:27 -0700
committerDavid S. Miller <davem@davemloft.net>2020-05-01 17:02:27 -0700
commit115506fea499f1cd9a80290b31eca4352e0559e9 (patch)
tree25e4ff3b5a49115d964fab690cf72fa18a5f96bf /kernel
parent5b95dea31636ce93660930d16172fe75589b2e70 (diff)
parent57dc6f3b4133f45e73d87895180ca1f3eaf01722 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says: ==================== pull-request: bpf-next 2020-05-01 (v2) The following pull-request contains BPF updates for your *net-next* tree. We've added 61 non-merge commits during the last 6 day(s) which contain a total of 153 files changed, 6739 insertions(+), 3367 deletions(-). The main changes are: 1) pulled work.sysctl from vfs tree with sysctl bpf changes. 2) bpf_link observability, from Andrii. 3) BTF-defined map in map, from Andrii. 4) asan fixes for selftests, from Andrii. 5) Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH, from Jakub. 6) production cloudflare classifier as a selftes, from Lorenz. 7) bpf_ktime_get_*_ns() helper improvements, from Maciej. 8) unprivileged bpftool feature probe, from Quentin. 9) BPF_ENABLE_STATS command, from Song. 10) enable bpf_[gs]etsockopt() helpers for sock_ops progs, from Stanislav. 11) enable a bunch of common helpers for cg-device, sysctl, sockopt progs, from Stanislav. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/cgroup.c146
-rw-r--r--kernel/bpf/core.c6
-rw-r--r--kernel/bpf/helpers.c89
-rw-r--r--kernel/bpf/syscall.c410
-rw-r--r--kernel/bpf/verifier.c80
-rw-r--r--kernel/cgroup/cgroup.c27
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/core.c9
-rw-r--r--kernel/sched/fair.c3
-rw-r--r--kernel/sched/rt.c10
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/sysctl.c3057
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/umh.c2
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/watchdog.c12
25 files changed, 2101 insertions, 1783 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d65c6912bdaf..a2cfba89a8e1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
@@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = {
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index cb305e71e7de..5c0e964105ac 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -557,8 +557,9 @@ found:
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
- struct bpf_prog *new_prog)
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
{
struct list_head *progs = &cgrp->bpf.progs[link->type];
struct bpf_prog *old_prog;
@@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
return 0;
}
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ mutex_lock(&cgroup_mutex);
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
@@ -808,17 +833,56 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
-const struct bpf_link_ops bpf_cgroup_link_lops = {
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ cg_link->type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = cg_link->type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
};
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_cgroup_link *link;
- struct file *link_file;
struct cgroup *cgrp;
- int err, link_fd;
+ int err;
if (attr->link_create.flags)
return -EINVAL;
@@ -832,26 +896,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_cgroup;
}
- bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog);
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_cgroup;
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
BPF_F_ALLOW_MULTI);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_cgroup:
cgroup_put(cgrp);
@@ -1054,36 +1117,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return !allow;
}
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* fall through */
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1137,16 +1185,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @head: sysctl table header
* @table: sysctl table
* @write: sysctl is being read (= 0) or written (= 1)
- * @buf: pointer to buffer passed by user space
+ * @buf: pointer to buffer (in and out)
* @pcount: value-result argument: value is size of buffer pointed to by @buf,
* result is size of @new_buf if program set new value, initial value
* otherwise
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @new_buf: pointer to pointer to new buffer that will be allocated if program
- * overrides new value provided by user space on sysctl write
- * NOTE: it's caller responsibility to free *new_buf if it was set
* @type: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
@@ -1157,8 +1202,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
+ void **buf, size_t *pcount, loff_t *ppos,
enum bpf_attach_type type)
{
struct bpf_sysctl_kern ctx = {
@@ -1173,36 +1217,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
.new_updated = 0,
};
struct cgroup *cgrp;
+ loff_t pos = 0;
int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
- if (ctx.cur_val) {
- mm_segment_t old_fs;
- loff_t pos = 0;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
- &ctx.cur_len, &pos)) {
- /* Let BPF program decide how to proceed. */
- ctx.cur_len = 0;
- }
- set_fs(old_fs);
- } else {
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
/* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
- if (write && buf && *pcount) {
+ if (write && *buf && *pcount) {
/* BPF program should be able to override new value with a
* buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
- if (!ctx.new_val ||
- copy_from_user(ctx.new_val, buf, ctx.new_len))
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
/* Let BPF program decide how to proceed. */
ctx.new_len = 0;
+ }
}
rcu_read_lock();
@@ -1213,7 +1249,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.cur_val);
if (ret == 1 && ctx.new_updated) {
- *new_buf = ctx.new_val;
+ kfree(*buf);
+ *buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
kfree(ctx.new_val);
@@ -1221,7 +1258,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
@@ -1326,7 +1362,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
@@ -1413,7 +1448,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 916f5132a984..6aa11de67315 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32)
return res;
}
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
@@ -2151,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bafc53ddd350..5c0290e0696e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns)
const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.func = bpf_ktime_get_ns,
- .gpl_only = true,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_boot_ns)
+{
+ /* NMI safe access to clock boottime */
+ return ktime_get_boot_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
+ .func = bpf_ktime_get_boot_ns,
+ .gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -562,3 +574,78 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
+ default:
+ break;
+ }
+
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7626b8024471..bb1ab7da6103 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -49,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/*
@@ -1546,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -2181,25 +2187,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
+ link->type = type;
+ link->id = 0;
link->ops = ops;
link->prog = prog;
}
+static void bpf_link_free_id(int id)
+{
+ if (!id)
+ return;
+
+ spin_lock_bh(&link_idr_lock);
+ idr_remove(&link_idr, id);
+ spin_unlock_bh(&link_idr_lock);
+}
+
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marksbpf_link as
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
+ * is not decremented, it's the responsibility of a calling code that failed
+ * to complete bpf_link initialization.
*/
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
{
- link->prog = NULL;
- fput(link_file);
- put_unused_fd(link_fd);
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->id);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
@@ -2210,6 +2230,7 @@ void bpf_link_inc(struct bpf_link *link)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bpf_link_free_id(link->id);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
@@ -2251,35 +2272,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
}
#ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- const char *link_type;
-
- if (link->ops == &bpf_raw_tp_lops)
- link_type = "raw_tracepoint";
- else if (link->ops == &bpf_tracing_link_lops)
- link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
- else if (link->ops == &bpf_cgroup_link_lops)
- link_type = "cgroup";
-#endif
- else
- link_type = "unknown";
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
+ "link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
- link_type,
+ bpf_link_type_strs[link->type],
+ link->id,
prog_tag,
prog->aux->id);
+ if (link->ops->show_fdinfo)
+ link->ops->show_fdinfo(link, m);
}
#endif
@@ -2292,36 +2313,77 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+ int id;
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
+
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations inbetween creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
*/
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
- int fd;
+ int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
- return ERR_PTR(fd);
+ return fd;
+
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ return id;
+ }
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
if (IS_ERR(file)) {
+ bpf_link_free_id(id);
put_unused_fd(fd);
- return file;
+ return PTR_ERR(file);
}
- *reserved_fd = fd;
- return file;
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ spin_lock_bh(&link_idr_lock);
+ primer->link->id = primer->id;
+ spin_unlock_bh(&link_idr_lock);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2345,6 +2407,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
struct bpf_tracing_link {
struct bpf_link link;
+ enum bpf_attach_type attach_type;
};
static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2360,16 +2423,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
kfree(tr_link);
}
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ seq_printf(seq,
+ "attach_type:\t%d\n",
+ tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ info->tracing.attach_type = tr_link->attach_type;
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_tracing_link_lops = {
.release = bpf_tracing_link_release,
.dealloc = bpf_tracing_link_dealloc,
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
+ .fill_link_info = bpf_tracing_link_fill_link_info,
};
static int bpf_tracing_prog_attach(struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_tracing_link *link;
- struct file *link_file;
- int link_fd, err;
+ int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -2402,24 +2489,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_prog;
}
err = bpf_trampoline_link_prog(prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_prog;
}
- fd_install(link_fd, link_file);
- return link_fd;
-
+ return bpf_link_settle(&link_primer);
out_put_prog:
bpf_prog_put(prog);
return err;
@@ -2447,22 +2533,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
kfree(raw_tp);
}
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ seq_printf(seq,
+ "tp_name:\t%s\n",
+ raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+ const char *tp_name = raw_tp_link->btp->tp->name;
+ u32 ulen = info->raw_tracepoint.tp_name_len;
+ size_t tp_len = strlen(tp_name);
+
+ if (ulen && !ubuf)
+ return -EINVAL;
+
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+ if (!ubuf)
+ return 0;
+
+ if (ulen >= tp_len + 1) {
+ if (copy_to_user(ubuf, tp_name, tp_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, tp_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
};
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
+ struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct file *link_file;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
- int link_fd, err;
+ int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
@@ -2515,24 +2648,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog);
link->btp = btp;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
@@ -3313,6 +3445,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
return btf_get_info_by_fd(btf, attr, uattr);
}
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_link_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = link->type;
+ info.id = link->id;
+ info.prog_id = link->prog->aux->id;
+
+ if (link->ops->fill_link_info) {
+ err = link->ops->fill_link_info(link, &info);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3337,6 +3505,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+ else if (f.file->f_op == &bpf_link_fops)
+ err = bpf_link_get_info_by_fd(f.file->private_data,
+ attr, uattr);
else
err = -EINVAL;
@@ -3464,7 +3635,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
- if (link->ops == &bpf_raw_tp_lops) {
+ if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3645,13 +3816,10 @@ static int link_update(union bpf_attr *attr)
goto out_put_progs;
}
-#ifdef CONFIG_CGROUP_BPF
- if (link->ops == &bpf_cgroup_link_lops) {
- ret = cgroup_bpf_replace(link, old_prog, new_prog);
- goto out_put_progs;
- }
-#endif
- ret = -EINVAL;
+ if (link->ops->update_prog)
+ ret = link->ops->update_prog(link, new_prog, old_prog);
+ else
+ ret = EINVAL;
out_put_progs:
if (old_prog)
@@ -3663,6 +3831,102 @@ out_put_link:
return ret;
}
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd, err;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&link_idr_lock);
+ link = idr_find(&link_idr, id);
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ if (link) {
+ if (link->id)
+ err = bpf_link_inc_not_zero(link);
+ else
+ err = -EAGAIN;
+ } else {
+ err = -ENOENT;
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ if (err)
+ return err;
+
+ fd = bpf_link_new_fd(link);
+ if (fd < 0)
+ bpf_link_put(link);
+
+ return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&bpf_stats_enabled_mutex);
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+ .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+ int fd;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+
+ /* Set a very high limit to avoid overflow */
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return -EBUSY;
+ }
+
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+ if (fd >= 0)
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (attr->enable_stats.type) {
+ case BPF_STATS_RUN_TIME:
+ return bpf_enable_runtime_stats();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
@@ -3780,6 +4044,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_LINK_UPDATE:
err = link_update(&attr);
break;
+ case BPF_LINK_GET_FD_BY_ID:
+ err = bpf_link_get_fd_by_id(&attr);
+ break;
+ case BPF_LINK_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &link_idr, &link_idr_lock);
+ break;
+ case BPF_ENABLE_STATS:
+ err = bpf_enable_stats(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fa1d8245b925..70ad009577f8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/* bpf_check() is a static code analyzer that walks eBPF program
@@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem {
int insn_idx;
int prev_insn_idx;
struct bpf_verifier_stack_elem *next;
+ /* length of verifier log at the time this state was pushed on stack */
+ u32 log_pos;
};
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
@@ -283,6 +287,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
log->ubuf = NULL;
}
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+ char zero = 0;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ log->len_used = new_pos;
+ if (put_user(zero, log->ubuf + new_pos))
+ log->ubuf = NULL;
+}
+
/* log_level controls verbosity level of eBPF verifier.
* bpf_verifier_log_write() is used to dump the verification trace to the log,
* so the user can figure out what's wrong with the program
@@ -413,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
-static bool is_acquire_function(enum bpf_func_id func_id)
+static bool may_be_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp;
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_map_lookup_elem;
+}
+
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -846,7 +881,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
- int *insn_idx)
+ int *insn_idx, bool pop_log)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem, *head = env->head;
@@ -860,6 +895,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (err)
return err;
}
+ if (pop_log)
+ bpf_vlog_reset(&env->log, head->log_pos);
if (insn_idx)
*insn_idx = head->insn_idx;
if (prev_insn_idx)
@@ -887,6 +924,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->log_pos = env->log.len_used;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -915,7 +953,7 @@ err:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
/* pop all elements and return */
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
return NULL;
}
@@ -3915,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_map_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -3923,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_hash_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4093,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
/* A reference acquiring function cannot acquire
* another refcounted ptr.
*/
- if (is_acquire_function(func_id) && count)
+ if (may_be_acquire_function(func_id) && count)
return false;
/* We only support one arg being unreferenced at the moment,
@@ -4604,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id)) {
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
@@ -5609,7 +5649,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
{
struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known, dst_known;
+ bool src_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
s32 s32_min_val, s32_max_val;
@@ -5631,7 +5671,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (alu32) {
src_known = tnum_subreg_is_const(src_reg.var_off);
- dst_known = tnum_subreg_is_const(dst_reg->var_off);
if ((src_known &&
(s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
@@ -5643,7 +5682,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
}
} else {
src_known = tnum_is_const(src_reg.var_off);
- dst_known = tnum_is_const(dst_reg->var_off);
if ((src_known &&
(smin_val != smax_val || umin_val != umax_val)) ||
smin_val > smax_val || umin_val > umax_val) {
@@ -6515,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (is_null) {
reg->type = SCALAR_VALUE;
} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (reg->map_ptr->inner_map_meta) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = reg->map_ptr->inner_map_meta;
- } else if (reg->map_ptr->map_type ==
- BPF_MAP_TYPE_XSKMAP) {
+ reg->map_ptr = map->inner_map_meta;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
@@ -8409,6 +8451,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
@@ -8685,7 +8728,7 @@ static int do_check(struct bpf_verifier_env *env)
process_bpf_exit:
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx);
+ &env->insn_idx, pop_log);
if (err < 0) {
if (err != -ENOENT)
return err;
@@ -10208,6 +10251,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -10270,7 +10314,9 @@ out:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
}
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
+ if (!ret && pop_log)
+ bpf_vlog_reset(&env->log, 0);
free_states(env);
if (ret)
/* clean aux data in case subprog was rejected */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 06b5ea9d899d..557a9b9d2244 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
return ret;
}
-int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
- struct bpf_prog *new_prog)
-{
- struct bpf_cgroup_link *cg_link;
- int ret;
-
- if (link->ops != &bpf_cgroup_link_lops)
- return -EINVAL;
-
- cg_link = container_of(link, struct bpf_cgroup_link, link);
-
- mutex_lock(&cgroup_mutex);
- /* link might have been auto-released by dying cgroup, so fail */
- if (!cg_link->cgroup) {
- ret = -EINVAL;
- goto out_unlock;
- }
- if (old_prog && link->prog != old_prog) {
- ret = -EPERM;
- goto out_unlock;
- }
- ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c2b41a263166..bdb1533ada81 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
* sysctl_perf_event_max_contexts_per_stack.
*/
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 633b4ae72ed5..468139611e06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00..ffbe03a45c16 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void)
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
+ void *buffer, size_t *length,
loff_t *ppos)
{
int ret;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
return 0;
}
-int sysctl_latencytop(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..3ccaba5f15c0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9a9b6156270b..471f649b5868 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
unsigned int old;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..3e89a042a48f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
@@ -2718,7 +2717,7 @@ void set_numabalancing_state(bool enabled)
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2792,8 +2791,8 @@ static void __init init_schedstats(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b85b6d..b6077fd5b32f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..45da29de3ecc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void)
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
@@ -2754,9 +2753,8 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8344757bba6e..fa64b2ee9fe6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,7 +209,7 @@ bool sched_energy_update;
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..d653d8426de9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
}
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a176d8727a3..7adfe5dbce9d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,9 @@
#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
+#include <linux/coredump.h>
+#include <linux/latencytop.h>
+#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -103,22 +106,6 @@
#if defined(CONFIG_SYSCTL)
-/* External variables not in a header file. */
-extern int suid_dumpable;
-#ifdef CONFIG_COREDUMP
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-#endif
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-extern int percpu_pagelist_fraction;
-extern int latencytop_enabled;
-extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
-
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
@@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
-#ifdef CONFIG_SPARC
-#endif
-
-#ifdef CONFIG_PARISC
-extern int pwrsw_enabled;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
-extern int unaligned_enabled;
-#endif
-
-#ifdef CONFIG_IA64
-extern int unaligned_dump_stack;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
-extern int no_unaligned_warning;
-#endif
#ifdef CONFIG_PROC_SYSCTL
@@ -207,102 +176,1472 @@ enum sysctl_writes_mode {
};
static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos);
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
#endif
+
+#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
#endif
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(char *data, int maxlen, int write,
+ char *buffer, size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char c, *p;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
+ p = buffer;
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
+ c = *(p++);
+ if (c == 0 || c == '\n')
+ break;
+ data[len++] = c;
+ }
+ data[len] = 0;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, data, len);
+ if (len < *lenp) {
+ buffer[len] = '\n';
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
+/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
+
+ return _proc_do_string(table->data, table->maxlen, write, buffer, lenp,
+ ppos);
+}
+
+static size_t proc_skip_spaces(char **buf)
+{
+ size_t ret;
+ char *tmp = skip_spaces(*buf);
+ ret = tmp - *buf;
+ *buf = tmp;
+ return ret;
+}
+
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+ while (*size) {
+ if (**buf != v)
+ break;
+ (*size)--;
+ (*buf)++;
+ }
+}
+
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
+ *
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+ unsigned long *val, bool *neg,
+ const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+ int len;
+ char *p, tmp[TMPBUFLEN];
+
+ if (!*size)
+ return -EINVAL;
+
+ len = *size;
+ if (len > TMPBUFLEN - 1)
+ len = TMPBUFLEN - 1;
+
+ memcpy(tmp, *buf, len);
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = true;
+ p++;
+ } else
+ *neg = false;
+ if (!isdigit(*p))
+ return -EINVAL;
+
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
+
+ len = p - tmp;
+
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ if (len == TMPBUFLEN - 1)
+ return -EINVAL;
+
+ if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ return -EINVAL;
+
+ if (tr && (len < *size))
+ *tr = *p;
+
+ *buf += len;
+ *size -= len;
+
+ return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success @buf and @size are updated with the amount of bytes
+ * written.
+ */
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ memcpy(*buf, tmp, len);
+ *size -= len;
+ *buf += len;
+}
+#undef TMPBUFLEN
+
+static void proc_put_char(void **buf, size_t *size, char c)
+{
+ if (*size) {
+ char **buffer = (char **)buf;
+ **buffer = c;
+
+ (*size)--;
+ (*buffer)++;
+ *buf = *buffer;
+ }
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = -(unsigned long)val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ int *i, vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first=0) {
+ unsigned long lval;
+ bool neg;
+
+ if (write) {
+ left -= proc_skip_spaces(&p);
+
+ if (!left)
+ break;
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
+ break;
+ }
+ } else {
+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;
+ break;
+ }
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, lval, neg);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err && left)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *p = buffer;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ proc_put_long(&buffer, &left, lval, false);
+ if (!left)
+ goto out;
+
+ proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+#endif
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
+}
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ int i;
+ for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ if ((tmptaint >> i) & 1)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+ }
+
+ return err;
+}
+
#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or -EINVAL on write when the range check fails.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
+
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ int ret;
+ unsigned int tmp;
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
+
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -ERANGE;
+
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success or -ERANGE on write when the range check fails.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned int val;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, NULL);
+}
+
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
+ }
+#endif
+}
+
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
#endif
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-#endif
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmp, ret;
-static struct ctl_table kern_table[];
-static struct ctl_table vm_table[];
-static struct ctl_table fs_table[];
-static struct ctl_table debug_table[];
-static struct ctl_table dev_table[];
-extern struct ctl_table random_table[];
-#ifdef CONFIG_EPOLL
-extern struct ctl_table epoll_table[];
-#endif
+ tmp = sysrq_mask();
-#ifdef CONFIG_FW_LOADER_USER_HELPER
-extern struct ctl_table firmware_config_table[];
-#endif
+ ret = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (ret || !write)
+ return ret;
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
-int sysctl_legacy_va_layout;
+ if (write)
+ sysrq_toggle_support(tmp);
+
+ return 0;
+}
#endif
-/* The default sysctl tables: */
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
+{
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
- { }
-};
+ if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
-#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
-#endif
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first = 0) {
+ unsigned long val;
+
+ if (write) {
+ bool neg;
+
+ left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
+
+ err = proc_get_long(&p, &left, &val, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (neg)
+ continue;
+ val = convmul * val / convdiv;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
+ *i = val;
+ } else {
+ val = convdiv * (*i) / convmul;
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, val, false);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > INT_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+ bool first = 1;
+ size_t left = *lenp;
+ unsigned long bitmap_len = table->maxlen;
+ unsigned long *bitmap = *(unsigned long **) table->data;
+ unsigned long *tmp_bitmap = NULL;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ char *p = buffer;
+ size_t skipped = 0;
+
+ if (left > PAGE_SIZE - 1) {
+ left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
+
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
+ if (!tmp_bitmap)
+ return -ENOMEM;
+ proc_skip_char(&p, &left, '\n');
+ while (!err && left) {
+ unsigned long val_a, val_b;
+ bool neg;
+ size_t saved_left;
+
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
+ sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_a >= bitmap_len || neg) {
+ err = -EINVAL;
+ break;
+ }
+
+ val_b = val_a;
+ if (left) {
+ p++;
+ left--;
+ }
+
+ if (c == '-') {
+ err = proc_get_long(&p, &left, &val_b,
+ &neg, tr_b, sizeof(tr_b),
+ &c);
+ /*
+ * If we consumed all of a truncated buffer or
+ * then stop here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_b >= bitmap_len || neg ||
+ val_a > val_b) {
+ err = -EINVAL;
+ break;
+ }
+ if (left) {
+ p++;
+ left--;
+ }
+ }
+
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ first = 0;
+ proc_skip_char(&p, &left, '\n');
+ }
+ left += skipped;
+ } else {
+ unsigned long bit_a, bit_b = 0;
+
+ while (left) {
+ bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ if (bit_a >= bitmap_len)
+ break;
+ bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ bit_a + 1) - 1;
+
+ if (!first)
+ proc_put_char(&buffer, &left, ',');
+ proc_put_long(&buffer, &left, bit_a, false);
+ if (bit_a != bit_b) {
+ proc_put_char(&buffer, &left, '-');
+ proc_put_long(&buffer, &left, bit_b, false);
+ }
+
+ first = 0; bit_b++;
+ }
+ proc_put_char(&buffer, &left, '\n');
+ }
+
+ if (!err) {
+ if (write) {
+ if (*ppos)
+ bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ else
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ }
+ *lenp -= left;
+ *ppos += *lenp;
+ }
+
+ bitmap_free(tmp_bitmap);
+ return err;
+}
+
+#else /* CONFIG_PROC_SYSCTL */
+
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (val)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ }
+ mutex_unlock(&static_key_mutex);
+ return ret;
+}
static struct ctl_table kern_table[] = {
{
@@ -613,7 +1952,7 @@ static struct ctl_table kern_table[] = {
.procname = "soft-power",
.data = &pwrsw_enabled,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1072,7 +2411,7 @@ static struct ctl_table kern_table[] = {
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1244,7 +2583,7 @@ static struct ctl_table kern_table[] = {
.data = &bpf_stats_enabled_key.key,
.maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_do_static_key,
+ .proc_handler = bpf_stats_handler,
},
#endif
#if defined(CONFIG_TREE_RCU)
@@ -1320,7 +2659,7 @@ static struct ctl_table vm_table[] = {
.proc_handler = overcommit_kbytes_handler,
},
{
- .procname = "page-cluster",
+ .procname = "page-cluster",
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
@@ -1491,7 +2830,7 @@ static struct ctl_table vm_table[] = {
.data = &watermark_boost_factor,
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
- .proc_handler = watermark_boost_factor_sysctl_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -1959,6 +3298,35 @@ static struct ctl_table dev_table[] = {
{ }
};
+static struct ctl_table sysctl_base_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+ { }
+};
+
int __init sysctl_init(void)
{
struct ctl_table_header *hdr;
@@ -1967,1474 +3335,7 @@ int __init sysctl_init(void)
kmemleak_not_leak(hdr);
return 0;
}
-
#endif /* CONFIG_SYSCTL */
-
-/*
- * /proc/sys support
- */
-
-#ifdef CONFIG_PROC_SYSCTL
-
-static int _proc_do_string(char *data, int maxlen, int write,
- char __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- char __user *p;
- char c;
-
- if (!data || !maxlen || !*lenp) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
- /* Only continue writes not past the end of buffer. */
- len = strlen(data);
- if (len > maxlen - 1)
- len = maxlen - 1;
-
- if (*ppos > len)
- return 0;
- len = *ppos;
- } else {
- /* Start writing from beginning of buffer. */
- len = 0;
- }
-
- *ppos += *lenp;
- p = buffer;
- while ((p - buffer) < *lenp && len < maxlen - 1) {
- if (get_user(c, p++))
- return -EFAULT;
- if (c == 0 || c == '\n')
- break;
- data[len++] = c;
- }
- data[len] = 0;
- } else {
- len = strlen(data);
- if (len > maxlen)
- len = maxlen;
-
- if (*ppos > len) {
- *lenp = 0;
- return 0;
- }
-
- data += *ppos;
- len -= *ppos;
-
- if (len > *lenp)
- len = *lenp;
- if (len)
- if (copy_to_user(buffer, data, len))
- return -EFAULT;
- if (len < *lenp) {
- if (put_user('\n', buffer + len))
- return -EFAULT;
- len++;
- }
- *lenp = len;
- *ppos += len;
- }
- return 0;
-}
-
-static void warn_sysctl_write(struct ctl_table *table)
-{
- pr_warn_once("%s wrote to %s when file position was not 0!\n"
- "This will not be supported in the future. To silence this\n"
- "warning, set kernel.sysctl_writes_strict = -1\n",
- current->comm, table->procname);
-}
-
-/**
- * proc_first_pos_non_zero_ignore - check if first position is allowed
- * @ppos: file position
- * @table: the sysctl table
- *
- * Returns true if the first position is non-zero and the sysctl_writes_strict
- * mode indicates this is not allowed for numeric input types. String proc
- * handlers can ignore the return value.
- */
-static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
- struct ctl_table *table)
-{
- if (!*ppos)
- return false;
-
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- return true;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- return false;
- default:
- return false;
- }
-}
-
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write)
- proc_first_pos_non_zero_ignore(ppos, table);
-
- return _proc_do_string((char *)(table->data), table->maxlen, write,
- (char __user *)buffer, lenp, ppos);
-}
-
-static size_t proc_skip_spaces(char **buf)
-{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
-}
-
-static void proc_skip_char(char **buf, size_t *size, const char v)
-{
- while (*size) {
- if (**buf != v)
- break;
- (*size)--;
- (*buf)++;
- }
-}
-
-/**
- * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
- * fail on overflow
- *
- * @cp: kernel buffer containing the string to parse
- * @endp: pointer to store the trailing characters
- * @base: the base to use
- * @res: where the parsed integer will be stored
- *
- * In case of success 0 is returned and @res will contain the parsed integer,
- * @endp will hold any trailing characters.
- * This function will fail the parse on overflow. If there wasn't an overflow
- * the function will defer the decision what characters count as invalid to the
- * caller.
- */
-static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
- unsigned long *res)
-{
- unsigned long long result;
- unsigned int rv;
-
- cp = _parse_integer_fixup_radix(cp, &base);
- rv = _parse_integer(cp, base, &result);
- if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
- return -ERANGE;
-
- cp += rv;
-
- if (endp)
- *endp = (char *)cp;
-
- *res = (unsigned long)result;
- return 0;
-}
-
-#define TMPBUFLEN 22
-/**
- * proc_get_long - reads an ASCII formatted integer from a user buffer
- *
- * @buf: a kernel buffer
- * @size: size of the kernel buffer
- * @val: this is where the number will be stored
- * @neg: set to %TRUE if number is negative
- * @perm_tr: a vector which contains the allowed trailers
- * @perm_tr_len: size of the perm_tr vector
- * @tr: pointer to store the trailer character
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes read. If @tr is non-NULL and a trailing
- * character exists (size is non-zero after returning from this
- * function), @tr is updated with the trailing character.
- */
-static int proc_get_long(char **buf, size_t *size,
- unsigned long *val, bool *neg,
- const char *perm_tr, unsigned perm_tr_len, char *tr)
-{
- int len;
- char *p, tmp[TMPBUFLEN];
-
- if (!*size)
- return -EINVAL;
-
- len = *size;
- if (len > TMPBUFLEN - 1)
- len = TMPBUFLEN - 1;
-
- memcpy(tmp, *buf, len);
-
- tmp[len] = 0;
- p = tmp;
- if (*p == '-' && *size > 1) {
- *neg = true;
- p++;
- } else
- *neg = false;
- if (!isdigit(*p))
- return -EINVAL;
-
- if (strtoul_lenient(p, &p, 0, val))
- return -EINVAL;
-
- len = p - tmp;
-
- /* We don't know if the next char is whitespace thus we may accept
- * invalid integers (e.g. 1234...a) or two integers instead of one
- * (e.g. 123...1). So lets not allow such large numbers. */
- if (len == TMPBUFLEN - 1)
- return -EINVAL;
-
- if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
- return -EINVAL;
-
- if (tr && (len < *size))
- *tr = *p;
-
- *buf += len;
- *size -= len;
-
- return 0;
-}
-
-/**
- * proc_put_long - converts an integer to a decimal ASCII formatted string
- *
- * @buf: the user buffer
- * @size: the size of the user buffer
- * @val: the integer to be converted
- * @neg: sign of the number, %TRUE for negative
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes written.
- */
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
- bool neg)
-{
- int len;
- char tmp[TMPBUFLEN], *p = tmp;
-
- sprintf(p, "%s%lu", neg ? "-" : "", val);
- len = strlen(tmp);
- if (len > *size)
- len = *size;
- if (copy_to_user(*buf, tmp, len))
- return -EFAULT;
- *size -= len;
- *buf += len;
- return 0;
-}
-#undef TMPBUFLEN
-
-static int proc_put_char(void __user **buf, size_t *size, char c)
-{
- if (*size) {
- char __user **buffer = (char __user **)buf;
- if (put_user(c, *buffer))
- return -EFAULT;
- (*size)--, (*buffer)++;
- *buf = *buffer;
- }
- return 0;
-}
-
-static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
- }
- return 0;
-}
-
-static int do_proc_douintvec_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > UINT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long)val;
- }
- return 0;
-}
-
-static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-
-static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- int *i, vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
- left = *lenp;
-
- if (!conv)
- conv = do_proc_dointvec_conv;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first=0) {
- unsigned long lval;
- bool neg;
-
- if (write) {
- left -= proc_skip_spaces(&p);
-
- if (!left)
- break;
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (conv(&neg, &lval, i, 1, data)) {
- err = -EINVAL;
- break;
- }
- } else {
- if (conv(&neg, &lval, i, 0, data)) {
- err = -EINVAL;
- break;
- }
- if (!first)
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, lval, neg);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err && left)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_dointvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec_w(unsigned int *tbl_data,
- struct ctl_table *table,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
- bool neg;
- char *kbuf = NULL, *p;
-
- left = *lenp;
-
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto bail_early;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return -EINVAL;
-
- left -= proc_skip_spaces(&p);
- if (!left) {
- err = -EINVAL;
- goto out_free;
- }
-
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err || neg) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (conv(&lval, tbl_data, 1, data)) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (!err && left)
- left -= proc_skip_spaces(&p);
-
-out_free:
- kfree(kbuf);
- if (err)
- return -EINVAL;
-
- return 0;
-
- /* This is in keeping with old __do_proc_dointvec() */
-bail_early:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
-
- left = *lenp;
-
- if (conv(&lval, tbl_data, 0, data)) {
- err = -EINVAL;
- goto out;
- }
-
- err = proc_put_long(&buffer, &left, lval, false);
- if (err || !left)
- goto out;
-
- err = proc_put_char(&buffer, &left, '\n');
-
-out:
- *lenp -= left;
- *ppos += *lenp;
-
- return err;
-}
-
-static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned int *i, vleft;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
-
- /*
- * Arrays are not supported, keep this simple. *Do not* add
- * support for them.
- */
- if (vleft != 1) {
- *lenp = 0;
- return -EINVAL;
- }
-
- if (!conv)
- conv = do_proc_douintvec_conv;
-
- if (write)
- return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
- conv, data);
- return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_douintvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-/**
- * proc_dointvec - read a vector of integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
-}
-
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
-/**
- * proc_douintvec - read a vector of unsigned integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
-}
-
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- int i;
- for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
- if ((tmptaint >> i) & 1)
- add_taint(i, LOCKDEP_STILL_OK);
- }
- }
-
- return err;
-}
-
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
- int *min;
- int *max;
-};
-
-static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_dointvec_minmax - read a vector of integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success or -EINVAL on write when the range check fails.
- */
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_minmax_conv, &param);
-}
-
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
- unsigned int *min;
- unsigned int *max;
-};
-
-static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- int ret;
- unsigned int tmp;
- struct do_proc_douintvec_minmax_conv_param *param = data;
- /* write via temporary local uint for bounds-checking */
- unsigned int *up = write ? &tmp : valp;
-
- ret = do_proc_douintvec_conv(lvalp, up, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -ERANGE;
-
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string. Negative
- * strings are not allowed.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max). There is a final sanity
- * check for UINT_MAX to avoid having to support wrap around uses from
- * userspace.
- *
- * Returns 0 on success or -ERANGE on write when the range check fails.
- */
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = (unsigned int *) table->extra1,
- .max = (unsigned int *) table->extra2,
- };
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
-}
-
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned int val;
-
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
-
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
- }
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
- }
-#endif
-}
-
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- unsigned long *i, *min, *max;
- int vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
- vleft = table->maxlen / sizeof(unsigned long);
- left = *lenp;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first = 0) {
- unsigned long val;
-
- if (write) {
- bool neg;
-
- left -= proc_skip_spaces(&p);
- if (!left)
- break;
-
- err = proc_get_long(&p, &left, &val, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (neg)
- continue;
- val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max)) {
- err = -EINVAL;
- break;
- }
- *i = val;
- } else {
- val = convdiv * (*i) / convmul;
- if (!first) {
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, val, false);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- return __do_proc_doulongvec_minmax(table->data, table, write,
- buffer, lenp, ppos, convmul, convdiv);
-}
-
-/**
- * proc_doulongvec_minmax - read a vector of long integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer,
- lenp, ppos, HZ, 1000l);
-}
-
-
-static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > INT_MAX / HZ)
- return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = lval / HZ;
- }
- return 0;
-}
-
-static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
- return 1;
- *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_clock_t(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
-
- if (jif > INT_MAX)
- return 1;
- *valp = (int)jif;
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_msecs(lval);
- }
- return 0;
-}
-
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_userhz_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_conv, NULL);
-}
-
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
-/**
- * proc_do_large_bitmap - read/write from/to a large bitmap
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * The bitmap is stored at table->data and the bitmap length (in bits)
- * in table->maxlen.
- *
- * We use a range comma separated format (e.g. 1,3-4,10-10) so that
- * large bitmaps may be represented in a compact manner. Writing into
- * the file will clear the bitmap then update it with the given input.
- *
- * Returns 0 on success.
- */
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int err = 0;
- bool first = 1;
- size_t left = *lenp;
- unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = *(unsigned long **) table->data;
- unsigned long *tmp_bitmap = NULL;
- char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
-
- if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- char *kbuf, *p;
- size_t skipped = 0;
-
- if (left > PAGE_SIZE - 1) {
- left = PAGE_SIZE - 1;
- /* How much of the buffer we'll skip this pass */
- skipped = *lenp - left;
- }
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
- if (!tmp_bitmap) {
- kfree(kbuf);
- return -ENOMEM;
- }
- proc_skip_char(&p, &left, '\n');
- while (!err && left) {
- unsigned long val_a, val_b;
- bool neg;
- size_t saved_left;
-
- /* In case we stop parsing mid-number, we can reset */
- saved_left = left;
- err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
- sizeof(tr_a), &c);
- /*
- * If we consumed the entirety of a truncated buffer or
- * only one char is left (may be a "-"), then stop here,
- * reset, & come back for more.
- */
- if ((left <= 1) && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_a >= bitmap_len || neg) {
- err = -EINVAL;
- break;
- }
-
- val_b = val_a;
- if (left) {
- p++;
- left--;
- }
-
- if (c == '-') {
- err = proc_get_long(&p, &left, &val_b,
- &neg, tr_b, sizeof(tr_b),
- &c);
- /*
- * If we consumed all of a truncated buffer or
- * then stop here, reset, & come back for more.
- */
- if (!left && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_b >= bitmap_len || neg ||
- val_a > val_b) {
- err = -EINVAL;
- break;
- }
- if (left) {
- p++;
- left--;
- }
- }
-
- bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
- proc_skip_char(&p, &left, '\n');
- }
- kfree(kbuf);
- left += skipped;
- } else {
- unsigned long bit_a, bit_b = 0;
-
- while (left) {
- bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
- if (bit_a >= bitmap_len)
- break;
- bit_b = find_next_zero_bit(bitmap, bitmap_len,
- bit_a + 1) - 1;
-
- if (!first) {
- err = proc_put_char(&buffer, &left, ',');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, bit_a, false);
- if (err)
- break;
- if (bit_a != bit_b) {
- err = proc_put_char(&buffer, &left, '-');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, bit_b, false);
- if (err)
- break;
- }
-
- first = 0; bit_b++;
- }
- if (!err)
- err = proc_put_char(&buffer, &left, '\n');
- }
-
- if (!err) {
- if (write) {
- if (*ppos)
- bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
- else
- bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
- }
- *lenp -= left;
- *ppos += *lenp;
- }
-
- bitmap_free(tmp_bitmap);
- return err;
-}
-
-#else /* CONFIG_PROC_SYSCTL */
-
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-#endif /* CONFIG_PROC_SYSCTL */
-
-#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static DEFINE_MUTEX(static_key_mutex);
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&static_key_mutex);
- val = static_key_enabled(key);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
- if (val)
- static_key_enable(key);
- else
- static_key_disable(key);
- }
- mutex_unlock(&static_key_mutex);
- return ret;
-}
-#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5221abb4594..398e6eadb861 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -249,8 +249,7 @@ void timers_update_nohz(void)
}
int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1796747a77..e875c95d3ced 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8d2b98812625..167a74a15b1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..9788ed481a6a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3732c888a949..4ca61d49885b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table)
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_do_uts_string(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b6b1f54a7837..53ff2c81b084 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -661,7 +661,7 @@ static void proc_watchdog_update(void)
* proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old, *param = table->data;
@@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog
*/
int proc_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/nmi_watchdog
*/
int proc_nmi_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!nmi_watchdog_available && write)
return -ENOTSUPP;
@@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/soft_watchdog
*/
int proc_soft_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog_thresh
*/
int proc_watchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* been brought online, if desired.
*/
int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;