aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec2
-rw-r--r--kernel/audit.c23
-rw-r--r--kernel/audit.h7
-rw-r--r--kernel/auditfilter.c9
-rw-r--r--kernel/auditsc.c69
-rw-r--r--kernel/bpf/bpf_inode_storage.c1
-rw-r--r--kernel/bpf/bpf_task_storage.c1
-rw-r--r--kernel/bpf/cgroup.c19
-rw-r--r--kernel/bpf/helpers.c54
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/memalloc.c14
-rw-r--r--kernel/bpf/task_iter.c6
-rw-r--r--kernel/bpf/token.c1
-rw-r--r--kernel/bpf/verifier.c23
-rw-r--r--kernel/cgroup/cgroup.c25
-rw-r--r--kernel/events/core.c65
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/kcmp.c4
-rw-r--r--kernel/module/dups.c1
-rw-r--r--kernel/module/kmod.c1
-rw-r--r--kernel/module/main.c15
-rw-r--r--kernel/nsproxy.c5
-rw-r--r--kernel/pid.c20
-rw-r--r--kernel/resource.c4
-rw-r--r--kernel/sched/core.c21
-rw-r--r--kernel/sched/ext.c95
-rw-r--r--kernel/sched/ext.h2
-rw-r--r--kernel/sched/fair.c25
-rw-r--r--kernel/sched/sched.h41
-rw-r--r--kernel/sched/syscalls.c44
-rw-r--r--kernel/signal.c32
-rw-r--r--kernel/sys.c15
-rw-r--r--kernel/taskstats.c18
-rw-r--r--kernel/time/timekeeping.c105
-rw-r--r--kernel/time/timekeeping_debug.c13
-rw-r--r--kernel/time/timekeeping_internal.h15
-rw-r--r--kernel/trace/ring_buffer.c9
-rw-r--r--kernel/trace/trace.c32
-rw-r--r--kernel/ucount.c9
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/watch_queue.c6
43 files changed, 525 insertions, 346 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 6c34e63c88ff..4d111f871951 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,7 +97,7 @@ config KEXEC_JUMP
config CRASH_DUMP
bool "kernel crash dumps"
- default y
+ default ARCH_DEFAULT_CRASH_DUMP
depends on ARCH_SUPPORTS_CRASH_DUMP
depends on KEXEC_CORE
select VMCORE_INFO
diff --git a/kernel/audit.c b/kernel/audit.c
index 1edaa4846a47..6a95a6077953 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -123,7 +123,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
static kuid_t audit_sig_uid = INVALID_UID;
static pid_t audit_sig_pid = -1;
-static u32 audit_sig_sid;
+static struct lsm_prop audit_sig_lsm;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -1473,20 +1473,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
case AUDIT_SIGNAL_INFO:
len = 0;
- if (audit_sig_sid) {
- err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx,
+ &len);
if (err)
return err;
}
sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
- if (audit_sig_sid)
+ if (lsmprop_is_set(&audit_sig_lsm))
security_release_secctx(ctx, len);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
- if (audit_sig_sid) {
+ if (lsmprop_is_set(&audit_sig_lsm)) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
@@ -2102,8 +2103,8 @@ bool audit_string_contains_control(const char *string, size_t len)
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
- * @len: length of string (not including trailing null)
* @string: string to be logged
+ * @len: length of string (not including trailing null)
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
@@ -2178,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key)
int audit_log_task_context(struct audit_buffer *ab)
{
+ struct lsm_prop prop;
char *ctx = NULL;
unsigned len;
int error;
- u32 sid;
- security_current_getsecid_subj(&sid);
- if (!sid)
+ security_current_getlsmprop_subj(&prop);
+ if (!lsmprop_is_set(&prop))
return 0;
- error = security_secid_to_secctx(sid, &ctx, &len);
+ error = security_lsmprop_to_secctx(&prop, &ctx, &len);
if (error) {
if (error != -EINVAL)
goto error_path;
@@ -2404,7 +2405,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_current_getsecid_subj(&audit_sig_sid);
+ security_current_getlsmprop_subj(&audit_sig_lsm);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index a60d2840559e..0211cb307d30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
@@ -81,7 +82,7 @@ struct audit_names {
kuid_t uid;
kgid_t gid;
dev_t rdev;
- u32 osid;
+ struct lsm_prop oprop;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
@@ -143,7 +144,7 @@ struct audit_context {
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
- u32 target_sid;
+ struct lsm_prop target_ref;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -160,7 +161,7 @@ struct audit_context {
kuid_t uid;
kgid_t gid;
umode_t mode;
- u32 osid;
+ struct lsm_prop oprop;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 470041c49a44..bceb9f58a09e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1339,8 +1339,8 @@ int audit_filter(int msgtype, unsigned int listtype)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
+ struct lsm_prop prop = { };
pid_t pid;
- u32 sid;
switch (f->type) {
case AUDIT_PID:
@@ -1370,9 +1370,10 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_current_getsecid_subj(&sid);
- result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule);
+ security_current_getlsmprop_subj(&prop);
+ result = security_audit_rule_match(
+ &prop, f->type, f->op,
+ f->lsm_rule);
}
break;
case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cd57053b4a69..91afdd0d036e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -100,7 +100,7 @@ struct audit_aux_data_pids {
kuid_t target_auid[AUDIT_AUX_PIDS];
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
- u32 target_sid[AUDIT_AUX_PIDS];
+ struct lsm_prop target_ref[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -470,7 +470,7 @@ static int audit_filter_rules(struct task_struct *tsk,
{
const struct cred *cred;
int i, need_sid = 1;
- u32 sid;
+ struct lsm_prop prop = { };
unsigned int sessionid;
if (ctx && rule->prio <= ctx->prio)
@@ -674,14 +674,16 @@ static int audit_filter_rules(struct task_struct *tsk,
* fork()/copy_process() in which case
* the new @tsk creds are still a dup
* of @current's creds so we can still
- * use security_current_getsecid_subj()
+ * use
+ * security_current_getlsmprop_subj()
* here even though it always refs
* @current's creds
*/
- security_current_getsecid_subj(&sid);
+ security_current_getlsmprop_subj(&prop);
need_sid = 0;
}
- result = security_audit_rule_match(sid, f->type,
+ result = security_audit_rule_match(&prop,
+ f->type,
f->op,
f->lsm_rule);
}
@@ -697,14 +699,14 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid,
+ &name->oprop,
f->type,
f->op,
f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (security_audit_rule_match(
- n->osid,
+ &n->oprop,
f->type,
f->op,
f->lsm_rule)) {
@@ -716,7 +718,7 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find ipc objects that match */
if (!ctx || ctx->type != AUDIT_IPC)
break;
- if (security_audit_rule_match(ctx->ipc.osid,
+ if (security_audit_rule_match(&ctx->ipc.oprop,
f->type, f->op,
f->lsm_rule))
++result;
@@ -1017,7 +1019,7 @@ static void audit_reset_context(struct audit_context *ctx)
ctx->target_pid = 0;
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
ctx->target_sessionid = 0;
- ctx->target_sid = 0;
+ lsmprop_init(&ctx->target_ref);
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
@@ -1091,8 +1093,9 @@ static inline void audit_free_context(struct audit_context *context)
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- kuid_t auid, kuid_t uid, unsigned int sessionid,
- u32 sid, char *comm)
+ kuid_t auid, kuid_t uid,
+ unsigned int sessionid, struct lsm_prop *prop,
+ char *comm)
{
struct audit_buffer *ab;
char *ctx = NULL;
@@ -1106,8 +1109,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (sid) {
- if (security_secid_to_secctx(sid, &ctx, &len)) {
+ if (lsmprop_is_set(prop)) {
+ if (security_lsmprop_to_secctx(prop, &ctx, &len)) {
audit_log_format(ab, " obj=(none)");
rc = 1;
} else {
@@ -1384,19 +1387,17 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, " a%d=%lx", i,
context->socketcall.args[i]);
break; }
- case AUDIT_IPC: {
- u32 osid = context->ipc.osid;
-
+ case AUDIT_IPC:
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
from_kuid(&init_user_ns, context->ipc.uid),
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
- if (osid) {
+ if (lsmprop_is_set(&context->ipc.oprop)) {
char *ctx = NULL;
u32 len;
- if (security_secid_to_secctx(osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", osid);
+ if (security_lsmprop_to_secctx(&context->ipc.oprop,
+ &ctx, &len)) {
*call_panic = 1;
} else {
audit_log_format(ab, " obj=%s", ctx);
@@ -1416,7 +1417,7 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_gid,
context->ipc.perm_mode);
}
- break; }
+ break;
case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
@@ -1558,13 +1559,11 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (n->osid != 0) {
+ if (lsmprop_is_set(&n->oprop)) {
char *ctx = NULL;
u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
+ if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) {
if (call_panic)
*call_panic = 2;
} else {
@@ -1653,8 +1652,8 @@ static void audit_log_uring(struct audit_context *ctx)
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (ctx->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(ctx->return_valid ==
+ AUDITSC_SUCCESS),
ctx->return_code);
audit_log_format(ab,
" items=%d"
@@ -1696,8 +1695,8 @@ static void audit_log_exit(void)
audit_log_format(ab, " per=%lx", context->personality);
if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(context->return_valid ==
+ AUDITSC_SUCCESS),
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
@@ -1780,7 +1779,7 @@ static void audit_log_exit(void)
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
- axs->target_sid[i],
+ &axs->target_ref[i],
axs->target_comm[i]))
call_panic = 1;
}
@@ -1789,7 +1788,7 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- context->target_sid, context->target_comm))
+ &context->target_ref, context->target_comm))
call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
@@ -2278,7 +2277,7 @@ static void audit_copy_inode(struct audit_names *name,
name->uid = inode->i_uid;
name->gid = inode->i_gid;
name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
+ security_inode_getlsmprop(inode, &name->oprop);
if (flags & AUDIT_INODE_NOEVAL) {
name->fcap_ver = -1;
return;
@@ -2632,7 +2631,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
context->ipc.has_perm = 0;
- security_ipc_getsecid(ipcp, &context->ipc.osid);
+ security_ipc_getlsmprop(ipcp, &context->ipc.oprop);
context->type = AUDIT_IPC;
}
@@ -2729,7 +2728,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &context->target_sid);
+ security_task_getlsmprop_obj(t, &context->target_ref);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2756,7 +2755,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &ctx->target_sid);
+ security_task_getlsmprop_obj(t, &ctx->target_ref);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2777,7 +2776,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
+ security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 29da6d3838f6..e16e79f8cd6d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -16,7 +16,6 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe0ba68..1eb9852a9f8e 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -16,7 +16,6 @@
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e7113d700b87..025d7e2214ae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ca3f0a2e5ed5..3d45ebe8afb4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2851,21 +2851,47 @@ struct bpf_iter_bits {
__u64 __opaque[2];
} __aligned(8);
+#define BITS_ITER_NR_WORDS_MAX 511
+
struct bpf_iter_bits_kern {
union {
- unsigned long *bits;
- unsigned long bits_copy;
+ __u64 *bits;
+ __u64 bits_copy;
};
- u32 nr_bits;
+ int nr_bits;
int bit;
} __aligned(8);
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
/**
* bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
* @it: The new bpf_iter_bits to be created
* @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
* @nr_words: The size of the specified memory area, measured in 8-byte units.
- * Due to the limitation of memalloc, it can't be greater than 512.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
*
* This function initializes a new bpf_iter_bits structure for iterating over
* a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
@@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (!unsafe_ptr__ign || !nr_words)
return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
/* Optimization for u64 mask */
if (nr_bits == 64) {
@@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (err)
return -EFAULT;
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
/* Fallback to memalloc */
kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
if (!kit->bits)
@@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
return err;
}
+ swap_ulong_in_u64(kit->bits, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
@@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
{
struct bpf_iter_bits_kern *kit = (void *)it;
- u32 nr_bits = kit->nr_bits;
- const unsigned long *bits;
- int bit;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
- if (nr_bits == 0)
+ if (!nr_bits || bit >= nr_bits)
return NULL;
bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
- bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ bit = find_next_bit(bits, nr_bits, bit + 1);
if (bit >= nr_bits) {
- kit->nr_bits = 0;
+ kit->bit = bit;
return NULL;
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 0218a5132ab5..9b60eda0f727 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index b3858a76e0b3..146f5b57cfb1 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;
if (size <= 192)
@@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 5af9e130e500..98d9b4c0daff 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,7 +5,6 @@
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
-#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
@@ -286,17 +285,14 @@ again:
curr_fd = 0;
}
- rcu_read_lock();
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
+ f = fget_task_next(curr_task, &curr_fd);
if (f) {
/* set info->fd */
info->fd = curr_fd;
- rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
- rcu_read_unlock();
put_task_struct(curr_task);
if (info->common.type == BPF_TASK_ITER_TID) {
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index dcbec1a0dfb3..26057aa13503 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -1,6 +1,5 @@
#include <linux/bpf.h>
#include <linux/vmalloc.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/kernel.h>
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587a6c76e564..bb99bada7e2e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
struct bpf_func_state *state,
enum bpf_access_type t)
{
- struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
- int min_valid_off, max_bpf_stack;
-
- /* If accessing instruction is a spill/fill from bpf_fastcall pattern,
- * add room for all caller saved registers below MAX_BPF_STACK.
- * In case if bpf_fastcall rewrite won't happen maximal stack depth
- * would be checked by check_max_stack_depth_subprog().
- */
- max_bpf_stack = MAX_BPF_STACK;
- if (aux->fastcall_pattern)
- max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
+ int min_valid_off;
if (t == BPF_WRITE || env->allow_uninit_stack)
- min_valid_off = -max_bpf_stack;
+ min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -17886,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17898,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5886b95c6eae..9bc4a84bd309 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
- int level = 1;
+ int level = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
- if (level > cgroup->max_depth)
+ if (level >= cgroup->max_depth)
goto fail;
level++;
@@ -6476,7 +6476,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
- struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
cgroup_lock();
@@ -6493,14 +6492,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return 0;
}
- f = fget_raw(kargs->cgroup);
- if (!f) {
+ CLASS(fd_raw, f)(kargs->cgroup);
+ if (fd_empty(f)) {
ret = -EBADF;
goto err;
}
- sb = f->f_path.dentry->d_sb;
+ sb = fd_file(f)->f_path.dentry->d_sb;
- dst_cgrp = cgroup_get_from_file(f);
+ dst_cgrp = cgroup_get_from_file(fd_file(f));
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
@@ -6548,15 +6547,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
}
put_css_set(cset);
- fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
cgroup_unlock();
- if (f)
- fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
@@ -6966,14 +6962,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
*/
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
- struct cgroup *cgrp;
- struct fd f = fdget_raw(fd);
- if (!fd_file(f))
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(fd_file(f));
- fdput(f);
- return cgrp;
+ return cgroup_v1v2_get_from_file(fd_file(f));
}
/**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..b65446be00a7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -966,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int ret = 0;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
- if (IS_ERR(css)) {
- ret = PTR_ERR(css);
- goto out;
- }
+ if (IS_ERR(css))
+ return PTR_ERR(css);
ret = perf_cgroup_ensure_storage(event, css);
if (ret)
- goto out;
+ return ret;
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -995,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
perf_detach_cgroup(event);
ret = -EINVAL;
}
-out:
- fdput(f);
return ret;
}
@@ -5998,18 +5994,9 @@ EXPORT_SYMBOL_GPL(perf_event_period);
static const struct file_operations perf_fops;
-static inline int perf_fget_light(int fd, struct fd *p)
+static inline bool is_perf_file(struct fd f)
{
- struct fd f = fdget(fd);
- if (!fd_file(f))
- return -EBADF;
-
- if (fd_file(f)->f_op != &perf_fops) {
- fdput(f);
- return -EBADF;
- }
- *p = f;
- return 0;
+ return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
}
static int perf_event_set_output(struct perf_event *event,
@@ -6057,20 +6044,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_OUTPUT:
{
- int ret;
+ CLASS(fd, output)(arg); // arg == -1 => empty
+ struct perf_event *output_event = NULL;
if (arg != -1) {
- struct perf_event *output_event;
- struct fd output;
- ret = perf_fget_light(arg, &output);
- if (ret)
- return ret;
+ if (!is_perf_file(output))
+ return -EBADF;
output_event = fd_file(output)->private_data;
- ret = perf_event_set_output(event, output_event);
- fdput(output);
- } else {
- ret = perf_event_set_output(event, NULL);
}
- return ret;
+ return perf_event_set_output(event, output_event);
}
case PERF_EVENT_IOC_SET_FILTER:
@@ -12664,7 +12645,6 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = EMPTY_FD;
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
@@ -12735,10 +12715,12 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
- err = perf_fget_light(group_fd, &group);
- if (err)
+ if (!is_perf_file(group)) {
+ err = -EBADF;
goto err_fd;
+ }
group_leader = fd_file(group)->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
@@ -12750,7 +12732,7 @@ SYSCALL_DEFINE5(perf_event_open,
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
- goto err_group_fd;
+ goto err_fd;
}
}
@@ -13017,12 +12999,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&current->perf_event_mutex);
/*
- * Drop the reference on the group_event after placing the
- * new event on the sibling_list. This ensures destruction
- * of the group leader will find the pointer to itself in
- * perf_group_detach().
+ * File reference in group guarantees that group_leader has been
+ * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find
+ * the pointer to itself in perf_group_detach().
*/
- fdput(group);
fd_install(event_fd, event_file);
return event_fd;
@@ -13041,8 +13022,6 @@ err_alloc:
err_task:
if (task)
put_task_struct(task);
-err_group_fd:
- fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -13959,7 +13938,7 @@ static void perf_event_clear_cpumask(unsigned int cpu)
}
/* migrate */
- list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ list_for_each_entry(pmu, &pmus, entry) {
if (pmu->scope == PERF_PMU_SCOPE_NONE ||
WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
continue;
diff --git a/kernel/exit.c b/kernel/exit.c
index 619f0014c33b..1dcddfe537ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,7 +25,6 @@
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..22f43721d031 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
+#include <linux/tick.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -653,11 +654,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- khugepaged_fork(mm, oldmm);
-
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
@@ -760,6 +756,8 @@ loop_out:
vma_iter_free(&vmi);
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
} else if (mpnt) {
/*
* The entire maple tree has already been duplicated. If the
@@ -775,7 +773,10 @@ out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
- dup_userfaultfd_complete(&uf);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
@@ -2292,6 +2293,7 @@ __latent_entropy struct task_struct *copy_process(
acct_clear_integrals(p);
posix_cputimers_init(&p->posix_cputimers);
+ tick_dep_init_task(p);
p->io_context = NULL;
audit_set_context(p, NULL);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3a24d6b5f559..396a067a8a56 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
if (ret < 0) {
if (ops->msi_free) {
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
ops->msi_free(domain, info, virq + i);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b0639f21041f..2c596851f8a9 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fdget_rcu(task, idx);
- rcu_read_unlock();
+ file = fget_task(task, idx);
if (file)
fput(file);
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index 9a92f2f8c9d3..bd2149fbe117 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -18,7 +18,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c
index 0800d9891692..25f253812512 100644
--- a/kernel/module/kmod.c
+++ b/kernel/module/kmod.c
@@ -15,7 +15,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 49b9bca9de12..4490924fe24e 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3202,7 +3202,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int
{
struct idempotent idem;
- if (!f || !(f->f_mode & FMODE_READ))
+ if (!(f->f_mode & FMODE_READ))
return -EBADF;
/* Are we the winners of the race and get to do this? */
@@ -3219,10 +3219,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
- int err;
- struct fd f;
-
- err = may_init_module();
+ int err = may_init_module();
if (err)
return err;
@@ -3233,10 +3230,10 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
|MODULE_INIT_COMPRESSED_FILE))
return -EINVAL;
- f = fdget(fd);
- err = idempotent_init_module(fd_file(f), uargs, flags);
- fdput(f);
- return err;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ return idempotent_init_module(fd_file(f), uargs, flags);
}
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index dc952c3b05af..c9d97ed20122 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,12 +545,12 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
if (proc_ns_file(fd_file(f))) {
@@ -580,7 +580,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
}
put_nsset(&nsset);
out:
- fdput(f);
return err;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 2715afb77eab..115448e89c3e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -536,11 +536,10 @@ EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
- struct fd f;
+ CLASS(fd, f)(fd);
struct pid *pid;
- f = fdget(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
pid = pidfd_pid(fd_file(f));
@@ -548,8 +547,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
get_pid(pid);
*flags = fd_file(f)->f_flags;
}
-
- fdput(f);
return pid;
}
@@ -747,23 +744,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
- struct fd f;
- int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
- f = fdget(pidfd);
- if (!fd_file(f))
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
pid = pidfd_pid(fd_file(f));
if (IS_ERR(pid))
- ret = PTR_ERR(pid);
- else
- ret = pidfd_getfd(pid, fd);
+ return PTR_ERR(pid);
- fdput(f);
- return ret;
+ return pidfd_getfd(pid, fd);
}
diff --git a/kernel/resource.c b/kernel/resource.c
index b730bd28b422..4101016e8b20 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
rams_size += 16;
}
- rams[i].start = res.start;
- rams[i++].end = res.end;
-
+ rams[i++] = res;
start = res.end + 1;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dbfb5717d6af..a1c353a62c56 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
#ifdef CONFIG_SCHED_CLASS_EXT
- } else if (task_should_scx(p)) {
+ } else if (task_should_scx(p->policy)) {
p->sched_class = &ext_sched_class;
#endif
} else {
@@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * SCX requires a balance() call before every pick_next_task() including
- * when waking up from SCHED_IDLE. If @start_class is below SCX, start
- * from SCX instead.
+ * SCX requires a balance() call before every pick_task() including when
+ * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+ * SCX instead. Also, set a flag to detect missing balance() call.
*/
- if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
+ if (scx_enabled()) {
+ rq->scx.flags |= SCX_RQ_BAL_PENDING;
+ if (sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+ }
#endif
/*
@@ -7025,7 +7028,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
{
if (dl_prio(prio))
return &dl_sched_class;
@@ -7034,7 +7037,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
return &rt_sched_class;
#ifdef CONFIG_SCHED_CLASS_EXT
- if (task_should_scx(p))
+ if (task_should_scx(policy))
return &ext_sched_class;
#endif
@@ -7142,7 +7145,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
- next_class = __setscheduler_class(p, prio);
+ next_class = __setscheduler_class(p->policy, prio);
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..751d73d500e5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
-static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static int scx_ops_bypass_depth;
+static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -2633,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+ rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
@@ -2644,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in scx_next_task_picked().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2947,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ bool kick_idle = false;
/*
- * If balance_scx() is telling us to keep running @prev, replenish slice
- * if necessary and keep running @prev. Otherwise, pop the first one
- * from the local DSQ.
- *
* WORKAROUND:
*
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2961,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding
* balance_scx().
*
- * For now, ignore cases where $prev is not on SCX. This isn't great and
- * can theoretically lead to stalls. However, for switch_all cases, this
- * happens only while a BPF scheduler is being loaded or unloaded, and,
- * for partial cases, fair will likely keep triggering this CPU.
+ * Keep running @prev if possible and avoid stalling from entering idle
+ * without balancing.
*
- * Once fair is fixed, restore WARN_ON_ONCE().
+ * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+ * if pick_task_scx() is called without preceding balance_scx().
*/
- if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
- prev->sched_class == &ext_sched_class) {
+ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+ if (prev_on_scx) {
+ keep_prev = true;
+ } else {
+ keep_prev = false;
+ kick_idle = true;
+ }
+ } else if (unlikely(keep_prev && !prev_on_scx)) {
+ /* only allowed during transitions */
+ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
+ */
+ if (keep_prev) {
p = prev;
if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL;
} else {
p = first_local_task(rq);
- if (!p)
+ if (!p) {
+ if (kick_idle)
+ scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL;
+ }
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4256,14 +4275,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
* Used by sched_fork() and __setscheduler_prio() to pick the matching
* sched_class. dl/rt are already handled.
*/
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
{
if (!scx_enabled() ||
unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
- return p->policy == SCHED_EXT;
+ return policy == SCHED_EXT;
}
/**
@@ -4298,18 +4317,20 @@ bool task_should_scx(struct task_struct *p)
*/
static void scx_ops_bypass(bool bypass)
{
- int depth, cpu;
+ int cpu;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
if (bypass) {
- depth = atomic_inc_return(&scx_ops_bypass_depth);
- WARN_ON_ONCE(depth <= 0);
- if (depth != 1)
- return;
+ scx_ops_bypass_depth++;
+ WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
+ if (scx_ops_bypass_depth != 1)
+ goto unlock;
} else {
- depth = atomic_dec_return(&scx_ops_bypass_depth);
- WARN_ON_ONCE(depth < 0);
- if (depth != 0)
- return;
+ scx_ops_bypass_depth--;
+ WARN_ON_ONCE(scx_ops_bypass_depth < 0);
+ if (scx_ops_bypass_depth != 0)
+ goto unlock;
}
/*
@@ -4326,7 +4347,7 @@ static void scx_ops_bypass(bool bypass)
struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock_irqsave(rq, &rf);
+ rq_lock(rq, &rf);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4362,11 +4383,13 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock_irqrestore(rq, &rf);
+ rq_unlock(rq, &rf);
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
+unlock:
+ raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -4489,11 +4512,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- p->sched_class = __setscheduler_class(p, p->prio);
+ p->sched_class = new_class;
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
@@ -4969,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}
@@ -5199,12 +5227,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = __setscheduler_class(p, p->prio);
+ p->sched_class = new_class;
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 246019519231..b1675bb59fc4 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
void init_sched_ext_class(void);
static inline u32 scx_cpuperf_target(s32 cpu)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c157d4860a3b..2d16c8545c71 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3369,7 +3369,7 @@ retry_pids:
vma = vma_next(&vmi);
}
- do {
+ for (; vma; vma = vma_next(&vmi)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
@@ -3491,7 +3491,7 @@ retry_pids:
*/
if (vma_pids_forced)
break;
- } for_each_vma(vmi, vma);
+ }
/*
* If no VMAs are remaining and VMAs were skipped due to the PID
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
struct sched_entity *se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
- SCHED_WARN_ON(se->sched_delayed);
- SCHED_WARN_ON(se->on_rq);
+ /*
+ * Must not reference @se again, see __block_task().
+ */
return NULL;
}
return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
- /* Fix-up what block_task() skipped. */
+ /*
+ * Fix-up what block_task() skipped.
+ *
+ * Must be last, @p might not be valid after this.
+ */
__block_task(rq, p);
}
@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
util_est_dequeue(&rq->cfs, p);
- if (dequeue_entities(rq, &p->se, flags) < 0) {
- util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ if (dequeue_entities(rq, &p->se, flags) < 0)
return false;
- }
- util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ /*
+ * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+ */
+
hrtick_update(rq);
return true;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 081519ffab46..c03b3d7b320e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -751,8 +751,9 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
- SCX_RQ_BYPASSING = 1 << 3,
+ SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
+ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
@@ -2769,8 +2770,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
static inline void __block_task(struct rq *rq, struct task_struct *p)
{
- WRITE_ONCE(p->on_rq, 0);
- ASSERT_EXCLUSIVE_WRITER(p->on_rq);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -2778,6 +2777,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
+
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+ /*
+ * The moment this write goes through, ttwu() can swoop in and migrate
+ * this task, rendering our rq->__lock ineffective.
+ *
+ * __schedule() try_to_wake_up()
+ * LOCK rq->__lock LOCK p->pi_lock
+ * pick_next_task()
+ * pick_next_task_fair()
+ * pick_next_entity()
+ * dequeue_entities()
+ * __block_task()
+ * RELEASE p->on_rq = 0 if (p->on_rq && ...)
+ * break;
+ *
+ * ACQUIRE (after ctrl-dep)
+ *
+ * cpu = select_task_rq();
+ * set_task_cpu(p, cpu);
+ * ttwu_queue()
+ * ttwu_do_activate()
+ * LOCK rq->__lock
+ * activate_task()
+ * STORE p->on_rq = 1
+ * UNLOCK rq->__lock
+ *
+ * Callers must ensure to not reference @p after this -- we no longer
+ * own it.
+ */
+ smp_store_release(&p->on_rq, 0);
}
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3800,7 +3831,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 0470bcc3d204..f9bed5ec719e 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -707,7 +707,7 @@ change:
}
prev_class = p->sched_class;
- next_class = __setscheduler_class(p, newprio);
+ next_class = __setscheduler_class(policy, newprio);
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -1081,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
/**
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
@@ -1164,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
#endif
}
- return sched_attr_copy_to_user(uattr, &kattr, usize);
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
#ifdef CONFIG_SMP
diff --git a/kernel/signal.c b/kernel/signal.c
index 4344860ffcac..65fd233f6f23 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
*/
rcu_read_lock();
ucounts = task_ucounts(t);
- sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
+ sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
+ override_rlimit);
rcu_read_unlock();
if (!sigpending)
return NULL;
@@ -3908,7 +3909,6 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
int ret;
- struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
enum pid_type type;
@@ -3921,20 +3921,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
- f = fdget(pidfd);
- if (!fd_file(f))
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
/* Is this a pidfd? */
pid = pidfd_to_pid(fd_file(f));
- if (IS_ERR(pid)) {
- ret = PTR_ERR(pid);
- goto err;
- }
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- ret = -EINVAL;
if (!access_pidfd_pidns(pid))
- goto err;
+ return -EINVAL;
switch (flags) {
case 0:
@@ -3958,28 +3955,23 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
- goto err;
+ return ret;
- ret = -EINVAL;
if (unlikely(sig != kinfo.si_signo))
- goto err;
+ return -EINVAL;
/* Only allow sending arbitrary signals to yourself. */
- ret = -EPERM;
if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
- goto err;
+ return -EPERM;
} else {
prepare_kill_siginfo(sig, &kinfo, type);
}
if (type == PIDTYPE_PGID)
- ret = kill_pgrp_info(sig, &kinfo, pid);
+ return kill_pgrp_info(sig, &kinfo, pid);
else
- ret = kill_pid_info_type(sig, &kinfo, pid, type);
-err:
- fdput(f);
- return ret;
+ return kill_pid_info_type(sig, &kinfo, pid, type);
}
static int
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..ebe10c27a9f4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1911,12 +1911,11 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
- struct fd exe;
+ CLASS(fd, exe)(fd);
struct inode *inode;
int err;
- exe = fdget(fd);
- if (!fd_file(exe))
+ if (fd_empty(exe))
return -EBADF;
inode = file_inode(fd_file(exe));
@@ -1926,18 +1925,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* sure that this one is executable as well, to avoid breaking an
* overall picture.
*/
- err = -EACCES;
if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
- goto exit;
+ return -EACCES;
err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
- goto exit;
+ return err;
- err = replace_mm_exe_file(mm, fd_file(exe));
-exit:
- fdput(exe);
- return err;
+ return replace_mm_exe_file(mm, fd_file(exe));
}
/*
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 0700f40c53ac..0cd680ccc7e5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -411,15 +411,14 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
struct nlattr *na;
size_t size;
u32 fd;
- struct fd f;
na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
if (!na)
return -EINVAL;
fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return 0;
size = nla_total_size(sizeof(struct cgroupstats));
@@ -427,14 +426,13 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
size);
if (rc < 0)
- goto err;
+ return rc;
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
if (na == NULL) {
nlmsg_free(rep_skb);
- rc = -EMSGSIZE;
- goto err;
+ return -EMSGSIZE;
}
stats = nla_data(na);
@@ -443,14 +441,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
- goto err;
+ return rc;
}
- rc = send_reply(rep_skb, info);
-
-err:
- fdput(f);
- return rc;
+ return send_reply(rep_skb, info);
}
static int cmd_attr_register_cpumask(struct genl_info *info)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..962b2a31f015 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -2394,6 +2411,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 floor = atomic64_read(&mg_floor);
+ ktime_t f_real, offset, coarse;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ *ts = tk_xtime(tk);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ coarse = timespec64_to_ktime(*ts);
+ f_real = ktime_add(floor, offset);
+ if (ktime_after(f_real, coarse))
+ *ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts: pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t old = atomic64_read(&mg_floor);
+ ktime_t offset, mono;
+ unsigned int seq;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ mono = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ mono = ktime_add_ns(mono, nsecs);
+
+ /*
+ * Attempt to update the floor with the new time value. As any
+ * update must be later then the existing floor, and would effect
+ * a change to ctime from the perspective of the current task,
+ * accept the resulting floor value regardless of the outcome of
+ * the swap.
+ */
+ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+ timekeeping_inc_mg_floor_swaps();
+ } else {
+ /*
+ * Another task changed mg_floor since "old" was fetched.
+ * "old" has been updated with the latest value of "mg_floor".
+ * That value is newer than the previous floor value, which
+ * is enough to effect a change to ctime. Accept it.
+ */
+ *ts = ktime_to_timespec64(ktime_add(old, offset));
+ }
+}
+
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
#define NUM_BINS 32
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+ return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..0bbae825bc02 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
* timekeeping debug functions
*/
#ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+ this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
#else
+
#define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
#endif
#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ea4f7bb1837..5807116bcd0b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2337,12 +2337,9 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers[cpu])
goto fail_free_buffers;
- /* If already mapped, do not hook to CPU hotplug */
- if (!start) {
- ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
- if (ret < 0)
- goto fail_free_buffers;
- }
+ ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ if (ret < 0)
+ goto fail_free_buffers;
mutex_init(&buffer->mutex);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a8f52b6527ca..6a891e00aa7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
ring_buffer_record_enable(buffer);
}
+static void tracing_reset_all_cpus(struct array_buffer *buf)
+{
+ struct trace_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ ring_buffer_reset(buffer);
+
+ ring_buffer_record_enable(buffer);
+}
+
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus_unlocked(void)
{
@@ -5501,6 +5520,10 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
+ "By default tracefs removes all OTH file permission bits.\n"
+ "When mounting tracefs an optional group id can be specified\n"
+ "which adds the group to every directory and file in tracefs:\n\n"
+ "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n"
"# echo 0 > tracing_on : quick way to disable tracing\n"
"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
" Important files:\n"
@@ -6141,8 +6164,13 @@ static void update_last_data(struct trace_array *tr)
if (!tr->text_delta && !tr->data_delta)
return;
- /* Clear old data */
- tracing_reset_online_cpus(&tr->array_buffer);
+ /*
+ * Need to clear all CPU buffers as there cannot be events
+ * from the previous boot mixed with events with this boot
+ * as that will cause a confusing trace. Need to clear all
+ * CPU buffers, even for those that may currently be offline.
+ */
+ tracing_reset_all_cpus(&tr->array_buffer);
/* Using current data now */
tr->text_delta = 0;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8c07714ff27d..696406939be5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+ bool override_rlimit)
{
/* Caller must hold a reference to ucounts */
struct ucounts *iter;
@@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
long new = atomic_long_add_return(1, &iter->rlimit[type]);
if (new < 0 || new > max)
- goto unwind;
+ goto dec_unwind;
if (iter == ucounts)
ret = new;
- max = get_userns_rlimit_max(iter->ns, type);
+ if (!override_rlimit)
+ max = get_userns_rlimit_max(iter->ns, type);
/*
* Grab an extra ucount reference for the caller when
* the rlimit count was previously 0.
@@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
dec_unwind:
dec = atomic_long_sub_return(1, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
-unwind:
do_dec_rlimit_put_ucounts(ucounts, iter, type);
return 0;
}
diff --git a/kernel/umh.c b/kernel/umh.c
index ff1f13a27d29..be9234270777 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -13,7 +13,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index d36242fd4936..1895fbc32bcb 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -663,16 +663,14 @@ struct watch_queue *get_watch_queue(int fd)
{
struct pipe_inode_info *pipe;
struct watch_queue *wqueue = ERR_PTR(-EINVAL);
- struct fd f;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
pipe = get_pipe_info(fd_file(f), false);
if (pipe && pipe->watch_queue) {
wqueue = pipe->watch_queue;
kref_get(&wqueue->usage);
}
- fdput(f);
}
return wqueue;