aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt27
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/arraymap.c17
-rw-r--r--kernel/bpf/bpf_inode_storage.c28
-rw-r--r--kernel/bpf/bpf_lsm.c65
-rw-r--r--kernel/bpf/bpf_struct_ops.c9
-rw-r--r--kernel/bpf/btf.c172
-rw-r--r--kernel/bpf/btf_iter.c2
-rw-r--r--kernel/bpf/btf_relocate.c2
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c21
-rw-r--r--kernel/bpf/hashtab.c16
-rw-r--r--kernel/bpf/helpers.c94
-rw-r--r--kernel/bpf/inode.c4
-rw-r--r--kernel/bpf/local_storage.c4
-rw-r--r--kernel/bpf/map_in_map.c38
-rw-r--r--kernel/bpf/memalloc.c12
-rw-r--r--kernel/bpf/relo_core.c2
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/stackmap.c131
-rw-r--r--kernel/bpf/syscall.c228
-rw-r--r--kernel/bpf/token.c78
-rw-r--r--kernel/bpf/verifier.c1401
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup.c27
-rw-r--r--kernel/configs/tiny.config6
-rw-r--r--kernel/crash_core.c33
-rw-r--r--kernel/crash_reserve.c3
-rw-r--r--kernel/dma/Kconfig7
-rw-r--r--kernel/dma/Makefile4
-rw-r--r--kernel/dma/direct.c8
-rw-r--r--kernel/dma/dummy.c21
-rw-r--r--kernel/dma/mapping.c154
-rw-r--r--kernel/dma/ops_helpers.c14
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/dma/remap.c6
-rw-r--r--kernel/dma/swiotlb.c6
-rw-r--r--kernel/events/core.c26
-rw-r--r--kernel/events/uprobes.c35
-rw-r--r--kernel/exit.c57
-rw-r--r--kernel/fork.c21
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex/core.c1
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/kexec_internal.h3
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/locking/rtmutex.c4
-rw-r--r--kernel/locking/rwsem.c4
-rw-r--r--kernel/locking/test-ww_mutex.c1
-rw-r--r--kernel/locking/ww_mutex.h2
-rw-r--r--kernel/module/Kconfig1
-rw-r--r--kernel/module/main.c2
-rw-r--r--kernel/nsproxy.c12
-rw-r--r--kernel/numa.c26
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/printk/printk.c11
-rw-r--r--kernel/resource.c71
-rw-r--r--kernel/resource_kunit.c143
-rw-r--r--kernel/sched/build_policy.c11
-rw-r--r--kernel/sched/core.c534
-rw-r--r--kernel/sched/cpufreq_schedutil.c56
-rw-r--r--kernel/sched/deadline.c503
-rw-r--r--kernel/sched/debug.c201
-rw-r--r--kernel/sched/ext.c7191
-rw-r--r--kernel/sched/ext.h91
-rw-r--r--kernel/sched/fair.c801
-rw-r--r--kernel/sched/features.h30
-rw-r--r--kernel/sched/idle.c25
-rw-r--r--kernel/sched/pelt.c20
-rw-r--r--kernel/sched/pelt.h1
-rw-r--r--kernel/sched/rt.c261
-rw-r--r--kernel/sched/sched.h301
-rw-r--r--kernel/sched/stop_task.c18
-rw-r--r--kernel/sched/syscalls.c141
-rw-r--r--kernel/sched/topology.c8
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/sys.c12
-rw-r--r--kernel/taskstats.c4
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/trace/bpf_trace.c108
-rw-r--r--kernel/trace/ring_buffer.c949
-rw-r--r--kernel/trace/trace.c372
-rw-r--r--kernel/trace/trace.h14
-rw-r--r--kernel/trace/trace_functions_graph.c23
-rw-r--r--kernel/trace/trace_output.c17
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_syscalls.c12
-rw-r--r--kernel/user_namespace.c5
-rw-r--r--kernel/vmcore_info.c8
-rw-r--r--kernel/watch_queue.c4
-rw-r--r--kernel/watchdog.c5
93 files changed, 12826 insertions, 2013 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..fe782cd77388 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -133,4 +133,29 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
+ achieve the following:
+
+ - Ease of experimentation and exploration: Enabling rapid
+ iteration of new scheduling policies.
+ - Customization: Building application-specific schedulers which
+ implement policies that are not applicable to general-purpose
+ schedulers.
+ - Rapid scheduler deployments: Non-disruptive swap outs of
+ scheduling policies in production environments.
+
+ sched_ext leverages BPF struct_ops feature to define a structure
+ which exports function callbacks and flags to BPF programs that
+ wish to implement scheduling policies. The struct_ops structure
+ exported by sched_ext is struct sched_ext_ops, and is conceptually
+ similar to struct sched_class.
+
+ For more information:
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
diff --git a/kernel/Makefile b/kernel/Makefile
index 3c13240dfc9f..87866b037fbe 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -116,7 +116,6 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
-obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0291eef9ce92..9b9c151b5c82 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -52,9 +52,3 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
-
-# Some source files are common to libbpf.
-vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf
-
-$(obj)/%.o: %.c FORCE
- $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index feabc0193852..79660e3fca4c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,6 +73,9 @@ int array_map_alloc_check(union bpf_attr *attr)
/* avoid overflow on round_up(map->value_size) */
if (attr->value_size > INT_MAX)
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -494,7 +497,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
if (map->btf_key_type_id)
seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -515,7 +518,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -600,7 +603,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -619,7 +622,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -632,7 +635,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
- void __percpu **pptr;
+ void __percpu *pptr;
u32 size;
meta.seq = seq;
@@ -648,7 +651,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
if (!info->percpu_value_buf) {
ctx.value = v;
} else {
- pptr = v;
+ pptr = (void __percpu *)(uintptr_t)v;
size = array->elem_size;
for_each_possible_cpu(cpu) {
copy_map_value_long(map, info->percpu_value_buf + off,
@@ -993,7 +996,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
}
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b0ef45db207c..29da6d3838f6 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -78,13 +78,12 @@ void bpf_inode_storage_free(struct inode *inode)
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(file_inode(f.file), map, true);
- fdput(f);
+ sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
return sdata ? sdata->data : NULL;
}
@@ -92,19 +91,16 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- if (!inode_storage_ptr(file_inode(f.file))) {
- fdput(f);
+ if (!inode_storage_ptr(file_inode(fd_file(f))))
return -EBADF;
- }
- sdata = bpf_local_storage_update(file_inode(f.file),
+ sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
- fdput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -123,15 +119,11 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct fd f = fdget_raw(*(int *)key);
- int err;
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
-
- err = inode_storage_delete(file_inode(f.file), map);
- fdput(f);
- return err;
+ return inode_storage_delete(file_inode(fd_file(f)), map);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 08a338e1f231..6292ac5f9bd1 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,7 +11,6 @@
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
-#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
@@ -36,6 +35,24 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+BTF_SET_START(bpf_lsm_disabled_hooks)
+BTF_ID(func, bpf_lsm_vm_enough_memory)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_getsecurity)
+BTF_ID(func, bpf_lsm_inode_listsecurity)
+BTF_ID(func, bpf_lsm_inode_copy_up_xattr)
+BTF_ID(func, bpf_lsm_getselfattr)
+BTF_ID(func, bpf_lsm_getprocattr)
+BTF_ID(func, bpf_lsm_setprocattr)
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_getsecurity)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_match)
+#endif
+BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_SET_END(bpf_lsm_disabled_hooks)
+
/* List of LSM hooks that should operate on 'current' cgroup regardless
* of function signature.
*/
@@ -97,15 +114,24 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char *func_name = prog->aux->attach_func_name;
+
if (!prog->gpl_compatible) {
bpf_log(vlog,
"LSM programs must have a GPL compatible license\n");
return -EINVAL;
}
- if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
+ if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
- prog->aux->attach_btf_id, prog->aux->attach_func_name);
+ btf_id, func_name);
return -EINVAL;
}
@@ -390,3 +416,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = {
.get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
+
+/* hooks return 0 or 1 */
+BTF_SET_START(bool_lsm_hooks)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_known)
+#endif
+BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
+BTF_SET_END(bool_lsm_hooks)
+
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *retval_range)
+{
+ /* no return value range for void hooks */
+ if (!prog->aux->attach_func_proto->type)
+ return -EINVAL;
+
+ if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) {
+ retval_range->minval = 0;
+ retval_range->maxval = 1;
+ } else {
+ /* All other available LSM hooks, except task_prctl, return 0
+ * on success and negative error code on failure.
+ * To keep things simple, we only allow bpf progs to return 0
+ * or negative errno for task_prctl too.
+ */
+ retval_range->minval = -MAX_ERRNO;
+ retval_range->maxval = 0;
+ }
+ return 0;
+}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 0d515ec57aa5..fda3dd2ee984 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -837,7 +837,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(st_map->btf,
map->btf_vmlinux_value_type_id,
value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
kfree(value);
@@ -1040,6 +1040,13 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ba91be08763a..75e4fe83c509 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -212,7 +212,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
- BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_CGROUP,
BTF_KFUNC_HOOK_SCHED_ACT,
BTF_KFUNC_HOOK_SK_SKB,
BTF_KFUNC_HOOK_SOCKET_FILTER,
@@ -790,7 +790,7 @@ const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
@@ -811,11 +811,6 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset)
return !*src;
}
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset);
-}
-
/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
@@ -3761,6 +3756,7 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
return -EINVAL;
}
+/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
@@ -3789,7 +3785,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
- btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
@@ -4631,7 +4626,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
@@ -5519,36 +5514,72 @@ static const char *alloc_obj_fields[] = {
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
- union {
- struct btf_id_set set;
- struct {
- u32 _cnt;
- u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
- } _arr;
- } aof;
struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
- memset(&aof, 0, sizeof(aof));
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
+ struct btf_id_set *new_aof;
+
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
- aof.set.ids[aof.set.cnt++] = id;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
+ }
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
}
- if (!aof.set.cnt)
+ if (!aof->cnt) {
+ kfree(aof);
return NULL;
- sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
- n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
@@ -5558,17 +5589,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
int j, tab_cnt;
t = btf_type_by_id(btf, i);
- if (!t) {
- ret = -EINVAL;
- goto free;
- }
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
- if (btf_id_set_contains(&aof.set, member->type))
+ if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
@@ -5587,7 +5614,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5596,9 +5624,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type->record = record;
tab->cnt++;
}
+ kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
return ERR_PTR(ret);
}
@@ -6245,12 +6276,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
@@ -6418,8 +6448,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
- case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
@@ -7678,21 +7711,16 @@ int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct btf *btf_get_by_fd(int fd)
{
struct btf *btf;
- struct fd f;
-
- f = fdget(fd);
+ CLASS(fd, f)(fd);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &btf_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &btf_fops)
return ERR_PTR(-EINVAL);
- }
- btf = f.file->private_data;
+ btf = fd_file(f)->private_data;
refcount_inc(&btf->refcnt);
- fdput(f);
return btf;
}
@@ -8054,15 +8082,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
+
static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
const struct btf_type *func, u32 func_flags)
{
u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
- const char *name, *sfx, *iter_name;
- const struct btf_param *arg;
+ const char *sfx, *iter_name;
const struct btf_type *t;
char exp_name[128];
u32 nr_args;
+ int btf_id;
/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
if (!flags || (flags & (flags - 1)))
@@ -8073,28 +8130,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
if (nr_args < 1)
return -EINVAL;
- arg = &btf_params(func)[0];
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!t || !btf_type_is_ptr(t))
- return -EINVAL;
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- if (!t || !__btf_type_is_struct(t))
- return -EINVAL;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
- return -EINVAL;
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
* fit nicely in stack slots
*/
+ t = btf_type_by_id(btf, btf_id);
if (t->size == 0 || (t->size % 8))
return -EINVAL;
/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
* naming pattern
*/
- iter_name = name + sizeof(ITER_PREFIX) - 1;
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
if (flags & KF_ITER_NEW)
sfx = "new";
else if (flags & KF_ITER_NEXT)
@@ -8309,13 +8359,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
case BPF_PROG_TYPE_SK_SKB:
@@ -8891,6 +8947,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
struct bpf_core_cand_list cands = {};
struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
@@ -8900,6 +8957,13 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c
new file mode 100644
index 000000000000..0e2c66a52df9
--- /dev/null
+++ b/kernel/bpf/btf_iter.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_iter.c"
diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c
new file mode 100644
index 000000000000..c12ccbf66507
--- /dev/null
+++ b/kernel/bpf/btf_relocate.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_relocate.c"
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8ba73042a239..e7113d700b87 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2581,6 +2581,8 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_cgroup_classid:
return &bpf_get_cgroup_classid_curr_proto;
#endif
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7ee62e38faf0..4e07cc057d6f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2302,6 +2302,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
+ struct bpf_prog_aux *aux = fp->aux;
if (fp->kprobe_override)
return false;
@@ -2311,7 +2312,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
* in the case of devmap and cpumap). Until device checks
* are implemented, prohibit adding dev-bound programs to program maps.
*/
- if (bpf_prog_is_dev_bound(fp->aux))
+ if (bpf_prog_is_dev_bound(aux))
return false;
spin_lock(&map->owner.lock);
@@ -2321,12 +2322,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
*/
map->owner.type = prog_type;
map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags = aux->xdp_has_frags;
+ map->owner.attach_func_proto = aux->attach_func_proto;
ret = true;
} else {
ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->owner.attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
}
spin_unlock(&map->owner.lock);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 06115f8728e8..b14b87463ee0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -462,6 +462,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -1049,14 +1052,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
- if (!pptr) {
+ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
+
+ if (!ptr) {
bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- l_new->ptr_to_pptr = pptr;
- pptr = *(void **)pptr;
+ l_new->ptr_to_pptr = ptr;
+ pptr = *(void __percpu **)ptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1586,7 +1590,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -2450,7 +2454,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b5f0adae8293..1a43d06eab28 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -158,6 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.func = bpf_get_smp_processor_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .allow_fastcall = true,
};
BPF_CALL_0(bpf_get_numa_node_id)
@@ -517,16 +518,15 @@ static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
}
BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
- long *, res)
+ s64 *, res)
{
long long _res;
int err;
+ *res = 0;
err = __bpf_strtoll(buf, buf_len, flags, &_res);
if (err < 0)
return err;
- if (_res != (long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -538,23 +538,23 @@ const struct bpf_func_proto bpf_strtol_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_size = sizeof(s64),
};
BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
- unsigned long *, res)
+ u64 *, res)
{
unsigned long long _res;
bool is_negative;
int err;
+ *res = 0;
err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
if (err < 0)
return err;
if (is_negative)
return -EINVAL;
- if (_res != (unsigned long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -566,7 +566,8 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
@@ -714,7 +715,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
if (cpu >= nr_cpu_ids)
return (unsigned long)NULL;
- return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+ return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
}
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
@@ -727,7 +728,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
- return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+ return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
}
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
@@ -1618,9 +1619,9 @@ void bpf_wq_cancel_and_free(void *val)
schedule_work(&work->delete_work);
}
-BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
{
- unsigned long *kptr = map_value;
+ unsigned long *kptr = dst;
/* This helper may be inlined by verifier. */
return xchg(kptr, (unsigned long)ptr);
@@ -1635,7 +1636,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
.ret_btf_id = BPF_PTR_POISON,
- .arg1_type = ARG_PTR_TO_KPTR,
+ .arg1_type = ARG_KPTR_XCHG_DEST,
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
.arg2_btf_id = BPF_PTR_POISON,
};
@@ -2033,6 +2034,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
}
+EXPORT_SYMBOL_GPL(bpf_base_func_proto);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
@@ -2457,6 +2459,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
return ret;
}
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
/**
* bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
* specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
@@ -2938,6 +2963,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
bpf_mem_free(&bpf_global_ma, kit->bits);
}
+/**
+ * bpf_copy_from_user_str() - Copy a string from an unsafe user address
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address, in user space.
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from userspace to BPF space. If user string is
+ * too long this will still ensure zero termination in the dst buffer unless
+ * buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
+ * memset all of @dst on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(!dst__sz))
+ return 0;
+
+ ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst, 0, dst__sz);
+
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst + ret, 0, dst__sz - ret);
+ else
+ ((char *)dst)[ret] = '\0';
+
+ return ret + 1;
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3023,6 +3089,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable)
BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -3051,6 +3118,7 @@ static int __init kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index af5d2ffadd70..d8fc5eba529d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -709,10 +709,10 @@ static void seq_print_delegate_opts(struct seq_file *m,
msk = 1ULL << e->val;
if (delegate_msk & msk) {
/* emit lower-case name without prefix */
- seq_printf(m, "%c", first ? '=' : ':');
+ seq_putc(m, first ? '=' : ':');
name += pfx_len;
while (*name) {
- seq_printf(m, "%c", tolower(*name));
+ seq_putc(m, tolower(*name));
name++;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a04f505aefe9..3969eb0382af 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -431,7 +431,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id,
&READ_ONCE(storage->buf)->data[0], m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
} else {
seq_puts(m, ": {\n");
for_each_possible_cpu(cpu) {
@@ -439,7 +439,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(storage->percpu_buf, cpu),
m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
}
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index b4f18c85d7bc..645bd30bc9a9 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -11,24 +11,18 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
- struct fd f;
- int ret;
+ CLASS(fd, f)(inner_map_ufd);
- f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
/* Does not support >1 level map-in-map */
- if (inner_map->inner_map_meta) {
- ret = -EINVAL;
- goto put;
- }
+ if (inner_map->inner_map_meta)
+ return ERR_PTR(-EINVAL);
- if (!inner_map->ops->map_meta_equal) {
- ret = -ENOTSUPP;
- goto put;
- }
+ if (!inner_map->ops->map_meta_equal)
+ return ERR_PTR(-ENOTSUPP);
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
@@ -36,10 +30,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta_size = sizeof(struct bpf_array);
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
- if (!inner_map_meta) {
- ret = -ENOMEM;
- goto put;
- }
+ if (!inner_map_meta)
+ return ERR_PTR(-ENOMEM);
inner_map_meta->map_type = inner_map->map_type;
inner_map_meta->key_size = inner_map->key_size;
@@ -53,8 +45,9 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
* invalid/empty/valid, but ERR_PTR in case of errors. During
* equality NULL or IS_ERR is equivalent.
*/
- ret = PTR_ERR(inner_map_meta->record);
- goto free;
+ struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
+ kfree(inner_map_meta);
+ return ret;
}
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps, as some members like
@@ -77,14 +70,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
}
-
- fdput(f);
return inner_map_meta;
-free:
- kfree(inner_map_meta);
-put:
- fdput(f);
- return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
@@ -110,9 +96,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
int ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
- struct fd f;
+ CLASS(fd, f)(ufd);
- f = fdget(ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
@@ -123,7 +108,6 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
else
inner_map = ERR_PTR(-EINVAL);
- fdput(f);
return inner_map;
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index dec892ded031..b3858a76e0b3 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -138,8 +138,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
if (c->percpu_size) {
- void **obj = kmalloc_node(c->percpu_size, flags, node);
- void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+ void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
+ void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
if (!obj || !pptr) {
free_percpu(pptr);
@@ -253,7 +253,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
static void free_one(void *obj, bool percpu)
{
if (percpu) {
- free_percpu(((void **)obj)[1]);
+ free_percpu(((void __percpu **)obj)[1]);
kfree(obj);
return;
}
@@ -509,8 +509,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
- struct bpf_mem_cache *c, __percpu *pc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
@@ -591,7 +591,7 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
int cpu, i, unit_size, percpu_size;
struct obj_cgroup *objcg;
struct bpf_mem_cache *c;
diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c
new file mode 100644
index 000000000000..aa822c9fcfde
--- /dev/null
+++ b/kernel/bpf/relo_core.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/relo_core.c"
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4b4f9670f1a9..49b8e5a0c6b4 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -308,7 +308,7 @@ put_file_unlock:
spin_unlock_bh(&reuseport_lock);
put_file:
- fput(socket->file);
+ sockfd_put(socket);
return err;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index c99f8e5234ac..3615c06b7dfa 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -124,8 +124,24 @@ free_smap:
return ERR_PTR(err);
}
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- u64 *ips, u32 trace_nr, bool user)
+ u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
@@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
- if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
- memcpy(id_offs[i].build_id, prev_build_id,
- BUILD_ID_SIZE_MAX);
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
goto build_id_valid;
}
- vma = find_vma(current->mm, ips[i]);
- if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
build_id_valid:
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- - vma->vm_start;
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
@@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len;
+ u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
- stack_map_get_build_id_offset(
- (struct bpf_stack_build_id *)new_bucket->data,
- ips, trace_nr, user);
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
- void *buf, u32 size, u64 flags)
+ void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;
- elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
- : sizeof(u64);
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;
@@ -427,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
if (trace_in)
trace = trace_in;
else if (kernel && task)
@@ -434,21 +453,34 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
crosstask, false);
- if (unlikely(!trace))
- goto err_fault;
- if (trace->nr < skip)
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
goto err_fault;
+ }
trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
- if (user && user_build_id)
- stack_map_get_build_id_offset(buf, ips, trace_nr, user);
- else
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
@@ -464,7 +496,7 @@ clear:
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}
const struct bpf_func_proto bpf_get_stack_proto = {
@@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
- u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
{
struct pt_regs *regs;
long res = -EINVAL;
@@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
regs = task_pt_regs(task);
if (regs)
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);
return res;
}
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
@@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
@@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr_kernel;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
@@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;
trace->nr = nr_kernel;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
/* restore nr */
trace->nr = nr;
@@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bf6c5f685ea2..a8f1808a1ca5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -550,7 +550,8 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
- btf_put(rec->fields[i].kptr.btf);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
@@ -596,7 +597,8 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
- btf_get(fields[i].kptr.btf);
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
@@ -733,15 +735,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
}
}
-/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+static void bpf_map_free(struct bpf_map *map)
{
- struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- security_bpf_map_free(map);
- bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
/* Delay freeing of btf_record for maps, as map_free
@@ -760,6 +758,16 @@ static void bpf_map_free_deferred(struct work_struct *work)
btf_put(btf);
}
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_free(map);
+}
+
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
@@ -829,7 +837,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
@@ -1411,28 +1419,12 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
put_token:
bpf_token_put(token);
return err;
}
-/* if error is returned, fd is released.
- * On success caller should complete fd access with matching fdput()
- */
-struct bpf_map *__bpf_map_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -1448,15 +1440,11 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- bpf_map_inc(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc(map);
return map;
}
@@ -1464,15 +1452,11 @@ EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- bpf_map_inc_with_uref(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc_with_uref(map);
return map;
}
@@ -1537,11 +1521,9 @@ static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -1550,26 +1532,20 @@ static int map_lookup_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- err = -EINVAL;
- goto err_put;
- }
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+ return -EINVAL;
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
value_size = bpf_map_value_size(map);
@@ -1600,8 +1576,6 @@ free_value:
kvfree(value);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1612,17 +1586,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1651,7 +1623,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
if (!err)
maybe_wait_bpf_programs(map);
@@ -1660,7 +1632,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1669,16 +1640,14 @@ err_put:
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
- struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1715,7 +1684,6 @@ out:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1726,30 +1694,24 @@ static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *unext_key = u64_to_user_ptr(attr->next_key);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if (ukey) {
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
} else {
key = NULL;
}
@@ -1781,8 +1743,6 @@ free_next_key:
kvfree(next_key);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -2011,11 +1971,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
@@ -2024,7 +1982,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -2094,7 +2052,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -2102,27 +2059,22 @@ err_put:
static int map_freeze(const union bpf_attr *attr)
{
- int err = 0, ufd = attr->map_fd;
+ int err = 0;
struct bpf_map *map;
- struct fd f;
if (CHECK_ATTR(BPF_MAP_FREEZE))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
- fdput(f);
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
- }
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
- fdput(f);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
return -EPERM;
- }
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
@@ -2137,7 +2089,6 @@ static int map_freeze(const union bpf_attr *attr)
WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
- fdput(f);
return err;
}
@@ -2407,18 +2358,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *____bpf_prog_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_prog_add(struct bpf_prog *prog, int i)
{
atomic64_add(i, &prog->aux->refcnt);
@@ -2474,20 +2413,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_prog *prog;
- prog = ____bpf_prog_get(f);
- if (IS_ERR(prog))
- return prog;
- if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
- prog = ERR_PTR(-EINVAL);
- goto out;
- }
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_prog_fops)
+ return ERR_PTR(-EINVAL);
+
+ prog = fd_file(f)->private_data;
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv))
+ return ERR_PTR(-EINVAL);
bpf_prog_inc(prog);
-out:
- fdput(f);
return prog;
}
@@ -3256,20 +3194,16 @@ int bpf_link_new_fd(struct bpf_link *link)
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll)
return ERR_PTR(-EINVAL);
- }
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
- fdput(f);
-
return link;
}
EXPORT_SYMBOL(bpf_link_get_from_fd);
@@ -4974,33 +4908,25 @@ static int bpf_link_get_info_by_fd(struct file *file,
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- int ufd = attr->info.bpf_fd;
- struct fd f;
- int err;
-
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
- f = fdget(ufd);
- if (!f.file)
+ CLASS(fd, f)(attr->info.bpf_fd);
+ if (fd_empty(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
- else
- err = -EINVAL;
-
- fdput(f);
- return err;
+ return -EINVAL;
}
#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
@@ -5188,14 +5114,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
- int err, ufd;
- struct fd f;
+ int err;
if (CHECK_ATTR(BPF_MAP_BATCH))
return -EINVAL;
- ufd = attr->batch.map_fd;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->batch.map_fd);
+
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -5215,7 +5140,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
@@ -5223,7 +5148,6 @@ err_put:
maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
}
- fdput(f);
return err;
}
@@ -5668,7 +5592,7 @@ static int token_create(union bpf_attr *attr)
return bpf_token_create(attr);
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
@@ -5932,6 +5856,7 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
@@ -5952,7 +5877,8 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index d6ccf8d00eab..dcbec1a0dfb3 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -116,67 +116,52 @@ int bpf_token_create(union bpf_attr *attr)
struct user_namespace *userns;
struct inode *inode;
struct file *file;
+ CLASS(fd, f)(attr->token_create.bpffs_fd);
struct path path;
- struct fd f;
+ struct super_block *sb;
umode_t mode;
int err, fd;
- f = fdget(attr->token_create.bpffs_fd);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- path = f.file->f_path;
- path_get(&path);
- fdput(f);
+ path = fd_file(f)->f_path;
+ sb = path.dentry->d_sb;
- if (path.dentry != path.mnt->mnt_sb->s_root) {
- err = -EINVAL;
- goto out_path;
- }
- if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
- err = -EINVAL;
- goto out_path;
- }
+ if (path.dentry != sb->s_root)
+ return -EINVAL;
+ if (sb->s_op != &bpf_super_ops)
+ return -EINVAL;
err = path_permission(&path, MAY_ACCESS);
if (err)
- goto out_path;
+ return err;
- userns = path.dentry->d_sb->s_user_ns;
+ userns = sb->s_user_ns;
/*
* Enforce that creators of BPF tokens are in the same user
* namespace as the BPF FS instance. This makes reasoning about
* permissions a lot easier and we can always relax this later.
*/
- if (current_user_ns() != userns) {
- err = -EPERM;
- goto out_path;
- }
- if (!ns_capable(userns, CAP_BPF)) {
- err = -EPERM;
- goto out_path;
- }
+ if (current_user_ns() != userns)
+ return -EPERM;
+ if (!ns_capable(userns, CAP_BPF))
+ return -EPERM;
/* Creating BPF token in init_user_ns doesn't make much sense. */
- if (current_user_ns() == &init_user_ns) {
- err = -EOPNOTSUPP;
- goto out_path;
- }
+ if (current_user_ns() == &init_user_ns)
+ return -EOPNOTSUPP;
- mnt_opts = path.dentry->d_sb->s_fs_info;
+ mnt_opts = sb->s_fs_info;
if (mnt_opts->delegate_cmds == 0 &&
mnt_opts->delegate_maps == 0 &&
mnt_opts->delegate_progs == 0 &&
- mnt_opts->delegate_attachs == 0) {
- err = -ENOENT; /* no BPF token delegation is set up */
- goto out_path;
- }
+ mnt_opts->delegate_attachs == 0)
+ return -ENOENT; /* no BPF token delegation is set up */
mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
- inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto out_path;
- }
+ inode = bpf_get_inode(sb, NULL, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
inode->i_op = &bpf_token_iops;
inode->i_fop = &bpf_token_fops;
@@ -185,8 +170,7 @@ int bpf_token_create(union bpf_attr *attr)
file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
if (IS_ERR(file)) {
iput(inode);
- err = PTR_ERR(file);
- goto out_path;
+ return PTR_ERR(file);
}
token = kzalloc(sizeof(*token), GFP_USER);
@@ -218,33 +202,27 @@ int bpf_token_create(union bpf_attr *attr)
file->private_data = token;
fd_install(fd, file);
- path_put(&path);
return fd;
out_token:
bpf_token_free(token);
out_file:
fput(file);
-out_path:
- path_put(&path);
return err;
}
struct bpf_token *bpf_token_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_token *token;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_token_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_token_fops)
return ERR_PTR(-EINVAL);
- }
- token = f.file->private_data;
+ token = fd_file(f)->private_data;
bpf_token_inc(token);
- fdput(f);
return token;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39d5710c68ad..9a7ed527e47e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -385,11 +385,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool reg_not_null(const struct bpf_reg_state *reg)
{
enum bpf_reg_type type;
@@ -2184,6 +2179,44 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
reg->smin_value = max_t(s64, reg->smin_value, new_smin);
reg->smax_value = min_t(s64, reg->smax_value, new_smax);
}
+
+ /* Here we would like to handle a special case after sign extending load,
+ * when upper bits for a 64-bit range are all 1s or all 0s.
+ *
+ * Upper bits are all 1s when register is in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
+ * Upper bits are all 0s when register is in a range:
+ * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
+ * Together this forms are continuous range:
+ * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
+ *
+ * Now, suppose that register range is in fact tighter:
+ * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
+ * Also suppose that it's 32-bit range is positive,
+ * meaning that lower 32-bits of the full 64-bit register
+ * are in the range:
+ * [0x0000_0000, 0x7fff_ffff] (W)
+ *
+ * If this happens, then any value in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
+ * is smaller than a lowest bound of the range (R):
+ * 0xffff_ffff_8000_0000
+ * which means that upper bits of the full 64-bit register
+ * can't be all 1s, when lower bits are in range (W).
+ *
+ * Note that:
+ * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
+ * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
+ * These relations are used in the conditions below.
+ */
+ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ reg->umin_value = reg->s32_min_value;
+ reg->umax_value = reg->s32_max_value;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->smin_value, reg->smax_value));
+ }
}
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
@@ -2336,6 +2369,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_unknown(env, regs + regno);
}
+static int __mark_reg_s32_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ u32 regno,
+ s32 s32_min,
+ s32 s32_max)
+{
+ struct bpf_reg_state *reg = regs + regno;
+
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
+
+ reg->smin_value = max_t(s64, reg->smin_value, s32_min);
+ reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+
+ reg_bounds_sync(reg);
+
+ return reg_bounds_sanity_check(env, reg, "s32_range");
+}
+
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -3337,9 +3389,87 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].jmp_point;
}
+#define LR_FRAMENO_BITS 3
+#define LR_SPI_BITS 6
+#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
+#define LR_SIZE_BITS 4
+#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1)
+#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1)
+#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
+#define LR_SPI_OFF LR_FRAMENO_BITS
+#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
+#define LINKED_REGS_MAX 6
+
+struct linked_reg {
+ u8 frameno;
+ union {
+ u8 spi;
+ u8 regno;
+ };
+ bool is_reg;
+};
+
+struct linked_regs {
+ int cnt;
+ struct linked_reg entries[LINKED_REGS_MAX];
+};
+
+static struct linked_reg *linked_regs_push(struct linked_regs *s)
+{
+ if (s->cnt < LINKED_REGS_MAX)
+ return &s->entries[s->cnt++];
+
+ return NULL;
+}
+
+/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+ * number of elements currently in stack.
+ * Pack one history entry for linked registers as 10 bits in the following format:
+ * - 3-bits frameno
+ * - 6-bits spi_or_reg
+ * - 1-bit is_reg
+ */
+static u64 linked_regs_pack(struct linked_regs *s)
+{
+ u64 val = 0;
+ int i;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+ u64 tmp = 0;
+
+ tmp |= e->frameno;
+ tmp |= e->spi << LR_SPI_OFF;
+ tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF;
+
+ val <<= LR_ENTRY_BITS;
+ val |= tmp;
+ }
+ val <<= LR_SIZE_BITS;
+ val |= s->cnt;
+ return val;
+}
+
+static void linked_regs_unpack(u64 val, struct linked_regs *s)
+{
+ int i;
+
+ s->cnt = val & LR_SIZE_MASK;
+ val >>= LR_SIZE_BITS;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+
+ e->frameno = val & LR_FRAMENO_MASK;
+ e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK;
+ e->is_reg = (val >> LR_IS_REG_OFF) & 0x1;
+ val >>= LR_ENTRY_BITS;
+ }
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags)
+ int insn_flags, u64 linked_regs)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_jmp_history_entry *p;
@@ -3355,6 +3485,10 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
"verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
+ WARN_ONCE(env->cur_hist_ent->linked_regs != 0,
+ "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
return 0;
}
@@ -3369,6 +3503,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
+ p->linked_regs = linked_regs;
cur->jmp_history_cnt = cnt;
env->cur_hist_ent = p;
@@ -3534,6 +3669,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
return bt->reg_masks[bt->frame] & (1 << reg);
}
+static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ return bt->reg_masks[frame] & (1 << reg);
+}
+
static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
return bt->stack_masks[frame] & (1ull << slot);
@@ -3578,6 +3718,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+/* If any register R in hist->linked_regs is marked as precise in bt,
+ * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
+ */
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+{
+ struct linked_regs linked_regs;
+ bool some_precise = false;
+ int i;
+
+ if (!hist || hist->linked_regs == 0)
+ return;
+
+ linked_regs_unpack(hist->linked_regs, &linked_regs);
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) ||
+ (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) {
+ some_precise = true;
+ break;
+ }
+ }
+
+ if (!some_precise)
+ return;
+
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if (e->is_reg)
+ bt_set_frame_reg(bt, e->frameno, e->regno);
+ else
+ bt_set_frame_slot(bt, e->frameno, e->spi);
+ }
+}
+
static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
/* For given verifier state backtrack_insn() is called from the last insn to
@@ -3617,6 +3793,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bt_sync_linked_regs(bt, hist);
+
if (class == BPF_ALU || class == BPF_ALU64) {
if (!bt_is_reg_set(bt, dreg))
return 0;
@@ -3846,7 +4028,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
*/
bt_set_reg(bt, dreg);
bt_set_reg(bt, sreg);
- /* else dreg <cond> K
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
* Only dreg still needs precision before
* this insn, so for the K-based conditional
* there is nothing new to be marked.
@@ -3864,6 +4047,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* to be analyzed */
return -ENOTSUPP;
}
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bt_sync_linked_regs(bt, hist);
return 0;
}
@@ -3991,96 +4178,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
}
}
-static bool idset_contains(struct bpf_idset *s, u32 id)
-{
- u32 i;
-
- for (i = 0; i < s->count; ++i)
- if (s->ids[i] == (id & ~BPF_ADD_CONST))
- return true;
-
- return false;
-}
-
-static int idset_push(struct bpf_idset *s, u32 id)
-{
- if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
- return -EFAULT;
- s->ids[s->count++] = id & ~BPF_ADD_CONST;
- return 0;
-}
-
-static void idset_reset(struct bpf_idset *s)
-{
- s->count = 0;
-}
-
-/* Collect a set of IDs for all registers currently marked as precise in env->bt.
- * Mark all registers with these IDs as precise.
- */
-static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_idset *precise_ids = &env->idset_scratch;
- struct backtrack_state *bt = &env->bt;
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- DECLARE_BITMAP(mask, 64);
- int i, fr;
-
- idset_reset(precise_ids);
-
- for (fr = bt->frame; fr >= 0; fr--) {
- func = st->frame[fr];
-
- bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (!reg->id || reg->type != SCALAR_VALUE)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
-
- bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE)
- break;
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
- }
-
- for (fr = 0; fr <= st->curframe; ++fr) {
- func = st->frame[fr];
-
- for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
- reg = &func->regs[i];
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_reg(bt, fr, i);
- }
- for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_slot(bt, fr, i);
- }
- }
-
- return 0;
-}
-
/*
* __mark_chain_precision() backtracks BPF program instruction sequence and
* chain of verifier states making sure that register *regno* (if regno >= 0)
@@ -4213,31 +4310,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bt->frame, last_idx, first_idx, subseq_idx);
}
- /* If some register with scalar ID is marked as precise,
- * make sure that all registers sharing this ID are also precise.
- * This is needed to estimate effect of find_equal_scalars().
- * Do this at the last instruction of each state,
- * bpf_reg_state::id fields are valid for these instructions.
- *
- * Allows to track precision in situation like below:
- *
- * r2 = unknown value
- * ...
- * --- state #0 ---
- * ...
- * r1 = r2 // r1 and r2 now share the same ID
- * ...
- * --- state #1 {r1.id = A, r2.id = A} ---
- * ...
- * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
- * ...
- * --- state #2 {r1.id = A, r2.id = A} ---
- * r3 = r10
- * r3 += r1 // need to mark both r1 and r2
- */
- if (mark_precise_scalar_ids(env, st))
- return -EFAULT;
-
if (last_idx < 0) {
/* we are at the entry into subprog, which
* is expected for global funcs, but only if
@@ -4458,7 +4530,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
if (!src_reg->id && !tnum_is_const(src_reg->var_off))
/* Ensure that src_reg has a valid ID that will be copied to
- * dst_reg and then will be used by find_equal_scalars() to
+ * dst_reg and then will be used by sync_linked_regs() to
* propagate min/max range.
*/
src_reg->id = ++env->id_gen;
@@ -4504,6 +4576,31 @@ static int get_reg_width(struct bpf_reg_state *reg)
return fls64(reg->umax_value);
}
+/* See comment for mark_fastcall_pattern_for_call() */
+static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int insn_idx, int off)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i;
+
+ if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern)
+ return;
+ /* access to the region [max_stack_depth .. fastcall_stack_off)
+ * from something that is not a part of the fastcall pattern,
+ * disable fastcall rewrites for current subprogram by setting
+ * fastcall_stack_off to a value smaller than any possible offset.
+ */
+ subprog->fastcall_stack_off = S16_MIN;
+ /* reset fastcall aux flags within subprogram,
+ * happens at most once per subprogram
+ */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ aux[i].fastcall_spills_num = 0;
+ aux[i].fastcall_pattern = 0;
+ }
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -4552,6 +4649,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
+ check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
bool reg_value_fits;
@@ -4627,7 +4725,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4686,6 +4784,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
return err;
}
+ check_fastcall_stack_contract(env, state, insn_idx, min_off);
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
u8 new_type, *stype;
@@ -4824,6 +4923,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
+ check_fastcall_stack_contract(env, state, env->insn_idx, off);
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -4932,7 +5032,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4984,6 +5084,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
min_off = reg->smin_value + off;
max_off = reg->smax_value + off;
mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
return 0;
}
@@ -5589,11 +5690,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type,
- struct btf **btf, u32 *btf_id)
+ struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx)
{
struct bpf_insn_access_aux info = {
.reg_type = *reg_type,
.log = &env->log,
+ .is_retval = false,
+ .is_ldsx = is_ldsx,
};
if (env->ops->is_valid_access &&
@@ -5606,6 +5709,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* type of narrower access.
*/
*reg_type = info.reg_type;
+ *is_retval = info.is_retval;
if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
@@ -6694,10 +6798,20 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
struct bpf_func_state *state,
enum bpf_access_type t)
{
- int min_valid_off;
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
+ int min_valid_off, max_bpf_stack;
+
+ /* If accessing instruction is a spill/fill from bpf_fastcall pattern,
+ * add room for all caller saved registers below MAX_BPF_STACK.
+ * In case if bpf_fastcall rewrite won't happen maximal stack depth
+ * would be checked by check_max_stack_depth_subprog().
+ */
+ max_bpf_stack = MAX_BPF_STACK;
+ if (aux->fastcall_pattern)
+ max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
if (t == BPF_WRITE || env->allow_uninit_stack)
- min_valid_off = -MAX_BPF_STACK;
+ min_valid_off = -max_bpf_stack;
else
min_valid_off = -state->allocated_stack;
@@ -6774,6 +6888,17 @@ static int check_stack_access_within_bounds(
return grow_stack_state(env, state, -min_off /* size */);
}
+static bool get_func_retval_range(struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ if (prog->type == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_MAC &&
+ !bpf_lsm_get_retval_range(prog, range)) {
+ return true;
+ }
+ return false;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -6878,6 +7003,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
+ bool is_retval = false;
+ struct bpf_retval_range range;
enum bpf_reg_type reg_type = SCALAR_VALUE;
struct btf *btf = NULL;
u32 btf_id = 0;
@@ -6893,7 +7020,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
- &btf_id);
+ &btf_id, &is_retval, is_ldsx);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -6902,7 +7029,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
+ if (is_retval && get_func_retval_range(env->prog, &range)) {
+ err = __mark_reg_s32_range(env, regs, value_regno,
+ range.minval, range.maxval);
+ if (err)
+ return err;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
} else {
mark_reg_known_zero(env, regs,
value_regno);
@@ -7666,29 +7800,38 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map_ptr = reg->map_ptr;
struct btf_field *kptr_field;
+ struct bpf_map *map_ptr;
+ struct btf_record *rec;
u32 kptr_off;
+ if (type_is_ptr_alloc_obj(reg->type)) {
+ rec = reg_btf_record(reg);
+ } else { /* PTR_TO_MAP_VALUE */
+ map_ptr = reg->map_ptr;
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ rec = map_ptr->record;
+ meta->map_ptr = map_ptr;
+ }
+
if (!tnum_is_const(reg->var_off)) {
verbose(env,
"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
regno);
return -EINVAL;
}
- if (!map_ptr->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
- map_ptr->name);
- return -EINVAL;
- }
- if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+
+ if (!btf_record_has_field(rec, BPF_KPTR)) {
+ verbose(env, "R%d has no valid kptr\n", regno);
return -EINVAL;
}
- meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
@@ -7833,12 +7976,17 @@ static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ITER_DESTROY;
}
-static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
+ const struct btf_param *arg)
{
/* btf_check_iter_kfuncs() guarantees that first argument of any iter
* kfunc is iter state pointer
*/
- return arg == 0 && is_iter_kfunc(meta);
+ if (is_iter_kfunc(meta))
+ return arg_idx == 0;
+
+ /* iter passed as an argument to a generic kfunc */
+ return btf_param_match_suffix(meta->btf, arg, "__iter");
}
static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
@@ -7846,14 +7994,20 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
const struct btf_type *t;
- const struct btf_param *arg;
- int spi, err, i, nr_slots;
- u32 btf_id;
+ int spi, err, i, nr_slots, btf_id;
- /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
- arg = &btf_params(meta->func_proto)[0];
- t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
- t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
+ * ensures struct convention, so we wouldn't need to do any BTF
+ * validation here. But given iter state can be passed as a parameter
+ * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
+ * conservative here.
+ */
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ if (btf_id < 0) {
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(meta->btf, btf_id);
nr_slots = t->size / BPF_REG_SIZE;
if (is_iter_new_kfunc(meta)) {
@@ -7875,7 +8029,9 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
if (err)
return err;
} else {
- /* iter_next() or iter_destroy() expect initialized iter state*/
+ /* iter_next() or iter_destroy(), as well as any kfunc
+ * accepting iter argument, expect initialized iter state
+ */
err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
switch (err) {
case 0:
@@ -7989,6 +8145,15 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
return 0;
}
+static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+}
+
/* process_iter_next_call() is called when verifier gets to iterator's next
* "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
* to it as just "iter_next()" in comments below.
@@ -8073,12 +8238,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
struct bpf_reg_state *cur_iter, *queued_iter;
- int iter_frameno = meta->iter.frameno;
- int iter_spi = meta->iter.spi;
BTF_TYPE_EMIT(struct bpf_iter);
- cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ cur_iter = get_iter_from_state(cur_st, meta);
if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
@@ -8106,7 +8269,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
if (!queued_st)
return -ENOMEM;
- queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter = get_iter_from_state(queued_st, meta);
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
queued_iter->iter.depth++;
if (prev_st)
@@ -8130,6 +8293,12 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_raw_mem(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_MEM &&
+ type & MEM_UNINIT;
+}
+
static bool arg_type_is_release(enum bpf_arg_type type)
{
return type & OBJ_RELEASE;
@@ -8140,16 +8309,6 @@ static bool arg_type_is_dynptr(enum bpf_arg_type type)
return base_type(type) == ARG_PTR_TO_DYNPTR;
}
-static int int_ptr_type_to_size(enum bpf_arg_type type)
-{
- if (type == ARG_PTR_TO_INT)
- return sizeof(u32);
- else if (type == ARG_PTR_TO_LONG)
- return sizeof(u64);
-
- return -EINVAL;
-}
-
static int resolve_map_arg_type(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_arg_type *arg_type)
@@ -8222,16 +8381,6 @@ static const struct bpf_reg_types mem_types = {
},
};
-static const struct bpf_reg_types int_ptr_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types spin_lock_types = {
.types = {
PTR_TO_MAP_VALUE,
@@ -8262,7 +8411,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_xchg_dest_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC
+ }
+};
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
@@ -8287,14 +8441,12 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
[ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
- [ARG_PTR_TO_INT] = &int_ptr_types,
- [ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
- [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
[ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
@@ -8333,7 +8485,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
+ /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
@@ -8426,7 +8579,8 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ /* Check if local kptr in src arg matches kptr in dst arg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
}
@@ -8737,7 +8891,7 @@ skip_type_check:
meta->release_regno = regno;
}
- if (reg->ref_obj_id) {
+ if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -8849,9 +9003,11 @@ skip_type_check:
*/
meta->raw_mode = arg_type & MEM_UNINIT;
if (arg_type & MEM_FIXED_SIZE) {
- err = check_helper_mem_access(env, regno,
- fn->arg_size[arg], false,
- meta);
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta);
+ if (err)
+ return err;
+ if (arg_type & MEM_ALIGNED)
+ err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
}
break;
case ARG_CONST_SIZE:
@@ -8876,17 +9032,6 @@ skip_type_check:
if (err)
return err;
break;
- case ARG_PTR_TO_INT:
- case ARG_PTR_TO_LONG:
- {
- int size = int_ptr_type_to_size(arg_type);
-
- err = check_helper_mem_access(env, regno, size, false, meta);
- if (err)
- return err;
- err = check_ptr_alignment(env, reg, 0, size, true);
- break;
- }
case ARG_PTR_TO_CONST_STR:
{
err = check_reg_const_str(env, reg, regno);
@@ -8894,7 +9039,7 @@ skip_type_check:
return err;
break;
}
- case ARG_PTR_TO_KPTR:
+ case ARG_KPTR_XCHG_DEST:
err = process_kptr_func(env, regno, meta);
if (err)
return err;
@@ -9203,15 +9348,15 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg1_type))
count++;
- if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg2_type))
count++;
- if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg3_type))
count++;
- if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg4_type))
count++;
- if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg5_type))
count++;
/* We only support one arg being in raw mode at the moment,
@@ -9925,9 +10070,13 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
-static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
+ bool return_32bit)
{
- return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+ if (return_32bit)
+ return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ else
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
}
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
@@ -9964,8 +10113,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (err)
return err;
- /* enforce R0 return value range */
- if (!retval_range_within(callee->callback_ret_range, r0)) {
+ /* enforce R0 return value range, and bpf_callback_t returns 64bit */
+ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range,
"At callback return", "R0");
return -EINVAL;
@@ -10267,6 +10416,19 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
state->callback_subprogno == subprogno);
}
+static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
+{
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
+ return -ERANGE;
+
+ if (!env->ops->get_func_proto)
+ return -EINVAL;
+
+ *ptr = env->ops->get_func_proto(func_id, env->prog);
+ return *ptr ? 0 : -EINVAL;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -10283,18 +10445,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
- func_id);
+ err = get_helper_proto(env, insn->imm, &fn);
+ if (err == -ERANGE) {
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
- if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id, env->prog);
- if (!fn) {
+ if (err) {
verbose(env, "program of this type cannot use helper %s#%d\n",
func_id_name(func_id), func_id);
- return -EINVAL;
+ return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -11230,7 +11390,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
- if (is_kfunc_arg_iter(meta, argno))
+ if (is_kfunc_arg_iter(meta, argno, &args[argno]))
return KF_ARG_PTR_TO_ITER;
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
@@ -11332,8 +11492,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
* btf_struct_ids_match() to walk the struct at the 0th offset, and
* resolve types.
*/
- if (is_kfunc_acquire(meta) ||
- (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
@@ -11950,7 +12109,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
switch (kf_arg_type) {
case KF_ARG_PTR_TO_CTX:
if (reg->type != PTR_TO_CTX) {
- verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
+ i, reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -12673,6 +12833,17 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+
+ if (is_iter_next_kfunc(&meta)) {
+ struct bpf_reg_state *cur_iter;
+
+ cur_iter = get_iter_from_state(env->cur_state, &meta);
+
+ if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
+ regs[BPF_REG_0].type |= MEM_RCU;
+ else
+ regs[BPF_REG_0].type |= PTR_TRUSTED;
+ }
}
if (is_kfunc_ret_null(&meta)) {
@@ -14101,7 +14272,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
u64 val = reg_const_value(src_reg, alu32);
if ((dst_reg->id & BPF_ADD_CONST) ||
- /* prevent overflow in find_equal_scalars() later */
+ /* prevent overflow in sync_linked_regs() later */
val > (u32)S32_MAX) {
/*
* If the register already went through rX += val
@@ -14116,7 +14287,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
} else {
/*
* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
+ * incorrectly propagated into other registers by sync_linked_regs()
*/
dst_reg->id = 0;
}
@@ -14266,7 +14437,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared if src_reg is not in u32
* range otherwise dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
+ * propagated into src_reg by sync_linked_regs()
*/
if (!is_src_reg_u32)
dst_reg->id = 0;
@@ -15089,14 +15260,66 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
-static void find_equal_scalars(struct bpf_verifier_state *vstate,
- struct bpf_reg_state *known_reg)
+static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg,
+ u32 id, u32 frameno, u32 spi_or_reg, bool is_reg)
+{
+ struct linked_reg *e;
+
+ if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id)
+ return;
+
+ e = linked_regs_push(reg_set);
+ if (e) {
+ e->frameno = frameno;
+ e->is_reg = is_reg;
+ e->regno = spi_or_reg;
+ } else {
+ reg->id = 0;
+ }
+}
+
+/* For all R being scalar registers or spilled scalar registers
+ * in verifier state, save R in linked_regs if R->id == id.
+ * If there are too many Rs sharing same id, reset id for leftover Rs.
+ */
+static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ id = id & ~BPF_ADD_CONST;
+ for (i = vstate->curframe; i >= 0; i--) {
+ func = vstate->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ __collect_linked_regs(linked_regs, reg, id, i, j, true);
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ __collect_linked_regs(linked_regs, reg, id, i, j, false);
+ }
+ }
+}
+
+/* For all R in linked_regs, copy known_reg range into R
+ * if R->id == known_reg->id.
+ */
+static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
+ struct linked_regs *linked_regs)
{
struct bpf_reg_state fake_reg;
- struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ struct linked_reg *e;
+ int i;
- bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ for (i = 0; i < linked_regs->cnt; ++i) {
+ e = &linked_regs->entries[i];
+ reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
+ : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr;
if (reg->type != SCALAR_VALUE || reg == known_reg)
continue;
if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
@@ -15114,7 +15337,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
copy_register_state(reg, known_reg);
/*
* Must preserve off, id and add_const flag,
- * otherwise another find_equal_scalars() will be incorrect.
+ * otherwise another sync_linked_regs() will be incorrect.
*/
reg->off = saved_off;
@@ -15122,7 +15345,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
scalar_min_max_add(reg, &fake_reg);
reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
}
- }));
+ }
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -15133,6 +15356,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
struct bpf_reg_state *eq_branch_regs;
+ struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -15247,6 +15471,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return 0;
}
+ /* Push scalar registers sharing same ID to jump history,
+ * do this before creating 'other_branch', so that both
+ * 'this_branch' and 'other_branch' share this history
+ * if parent state is created.
+ */
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
+ collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
+ collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ if (linked_regs.cnt > 1) {
+ err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ if (err)
+ return err;
+ }
+
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
false);
if (!other_branch)
@@ -15277,13 +15516,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- find_equal_scalars(this_branch, src_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ sync_linked_regs(this_branch, src_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- find_equal_scalars(this_branch, dst_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ sync_linked_regs(this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
}
/* if one pointer register is compared to another pointer
@@ -15571,6 +15810,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
+ bool return_32bit = false;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog || frame->in_exception_callback_fn) {
@@ -15676,12 +15916,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
case BPF_PROG_TYPE_LSM:
if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
- /* Regular BPF_PROG_TYPE_LSM programs can return
- * any value.
- */
- return 0;
- }
- if (!env->prog->aux->attach_func_proto->type) {
+ /* no range found, any return value is allowed */
+ if (!get_func_retval_range(env->prog, &range))
+ return 0;
+ /* no restricted range, any return value is allowed */
+ if (range.minval == S32_MIN && range.maxval == S32_MAX)
+ return 0;
+ return_32bit = true;
+ } else if (!env->prog->aux->attach_func_proto->type) {
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
@@ -15711,7 +15953,7 @@ enforce_retval:
if (err)
return err;
- if (!retval_range_within(range, reg)) {
+ if (!retval_range_within(range, reg, return_32bit)) {
verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
if (!is_subprog &&
prog->expected_attach_type == BPF_LSM_CGROUP &&
@@ -15877,6 +16119,274 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
return ret;
}
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Return a bitmask specifying which caller saved registers are
+ * clobbered by a call to a helper *as if* this helper follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if corresponding parameter has is described
+ * in the function prototype.
+ */
+static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn)
+{
+ u32 mask;
+ int i;
+
+ mask = 0;
+ if (fn->ret_type != RET_VOID)
+ mask |= BIT(BPF_REG_0);
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i)
+ if (fn->arg_type[i] != ARG_DONTCARE)
+ mask |= BIT(BPF_REG_1 + i);
+ return mask;
+}
+
+/* True if do_misc_fixups() replaces calls to helper number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+{
+ switch (imm) {
+#ifdef CONFIG_X86_64
+ case BPF_FUNC_get_smp_processor_id:
+ return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
+#endif
+ default:
+ return false;
+ }
+}
+
+/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */
+static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
+{
+ u32 vlen, i, mask;
+
+ vlen = btf_type_vlen(meta->func_proto);
+ mask = 0;
+ if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type)))
+ mask |= BIT(BPF_REG_0);
+ for (i = 0; i < vlen; ++i)
+ mask |= BIT(BPF_REG_1 + i);
+ return mask;
+}
+
+/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
+static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->btf == btf_vmlinux)
+ return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast];
+ return false;
+}
+
+/* LLVM define a bpf_fastcall function attribute.
+ * This attribute means that function scratches only some of
+ * the caller saved registers defined by ABI.
+ * For BPF the set of such registers could be defined as follows:
+ * - R0 is scratched only if function is non-void;
+ * - R1-R5 are scratched only if corresponding parameter type is defined
+ * in the function prototype.
+ *
+ * The contract between kernel and clang allows to simultaneously use
+ * such functions and maintain backwards compatibility with old
+ * kernels that don't understand bpf_fastcall calls:
+ *
+ * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5
+ * registers are not scratched by the call;
+ *
+ * - as a post-processing step, clang visits each bpf_fastcall call and adds
+ * spill/fill for every live r0-r5;
+ *
+ * - stack offsets used for the spill/fill are allocated as lowest
+ * stack offsets in whole function and are not used for any other
+ * purposes;
+ *
+ * - when kernel loads a program, it looks for such patterns
+ * (bpf_fastcall function surrounded by spills/fills) and checks if
+ * spill/fill stack offsets are used exclusively in fastcall patterns;
+ *
+ * - if so, and if verifier or current JIT inlines the call to the
+ * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary
+ * spill/fill pairs;
+ *
+ * - when old kernel loads a program, presence of spill/fill pairs
+ * keeps BPF program valid, albeit slightly less efficient.
+ *
+ * For example:
+ *
+ * r1 = 1;
+ * r2 = 2;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * *(u64 *)(r10 - 16) = r2; r2 = 2;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r2 = *(u64 *)(r10 - 16); r0 = r1;
+ * r1 = *(u64 *)(r10 - 8); r0 += r2;
+ * r0 = r1; exit;
+ * r0 += r2;
+ * exit;
+ *
+ * The purpose of mark_fastcall_pattern_for_call is to:
+ * - look for such patterns;
+ * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern;
+ * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction;
+ * - update env->subprog_info[*]->fastcall_stack_off to find an offset
+ * at which bpf_fastcall spill/fill stack slots start;
+ * - update env->subprog_info[*]->keep_fastcall_stack.
+ *
+ * The .fastcall_pattern and .fastcall_stack_off are used by
+ * check_fastcall_stack_contract() to check if every stack access to
+ * fastcall spill/fill stack slot originates from spill/fill
+ * instructions, members of fastcall patterns.
+ *
+ * If such condition holds true for a subprogram, fastcall patterns could
+ * be rewritten by remove_fastcall_spills_fills().
+ * Otherwise bpf_fastcall patterns are not changed in the subprogram
+ * (code, presumably, generated by an older clang version).
+ *
+ * For example, it is *not* safe to remove spill/fill below:
+ *
+ * r1 = 1;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!!
+ * r0 = *(u64 *)(r10 - 8); r0 += r1;
+ * r0 += r1; exit;
+ * exit;
+ */
+static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
+ struct bpf_subprog_info *subprog,
+ int insn_idx, s16 lowest_off)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
+ struct bpf_insn *call = &env->prog->insnsi[insn_idx];
+ const struct bpf_func_proto *fn;
+ u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS;
+ u32 expected_regs_mask;
+ bool can_be_inlined = false;
+ s16 off;
+ int i;
+
+ if (bpf_helper_call(call)) {
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return;
+ clobbered_regs_mask = helper_fastcall_clobber_mask(fn);
+ can_be_inlined = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ struct bpf_kfunc_call_arg_meta meta;
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return;
+
+ clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta);
+ can_be_inlined = is_fastcall_kfunc_call(&meta);
+ }
+
+ if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS)
+ return;
+
+ /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
+ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
+
+ /* match pairs of form:
+ *
+ * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0)
+ * ...
+ * call %[to_be_inlined]
+ * ...
+ * rX = *(u64 *)(r10 - Y)
+ */
+ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) {
+ if (insn_idx - i < 0 || insn_idx + i >= env->prog->len)
+ break;
+ stx = &insns[insn_idx - i];
+ ldx = &insns[insn_idx + i];
+ /* must be a stack spill/fill pair */
+ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) ||
+ stx->dst_reg != BPF_REG_10 ||
+ ldx->src_reg != BPF_REG_10)
+ break;
+ /* must be a spill/fill for the same reg */
+ if (stx->src_reg != ldx->dst_reg)
+ break;
+ /* must be one of the previously unseen registers */
+ if ((BIT(stx->src_reg) & expected_regs_mask) == 0)
+ break;
+ /* must be a spill/fill for the same expected offset,
+ * no need to check offset alignment, BPF_DW stack access
+ * is always 8-byte aligned.
+ */
+ if (stx->off != off || ldx->off != off)
+ break;
+ expected_regs_mask &= ~BIT(stx->src_reg);
+ env->insn_aux_data[insn_idx - i].fastcall_pattern = 1;
+ env->insn_aux_data[insn_idx + i].fastcall_pattern = 1;
+ }
+ if (i == 1)
+ return;
+
+ /* Conditionally set 'fastcall_spills_num' to allow forward
+ * compatibility when more helper functions are marked as
+ * bpf_fastcall at compile time than current kernel supports, e.g:
+ *
+ * 1: *(u64 *)(r10 - 8) = r1
+ * 2: call A ;; assume A is bpf_fastcall for current kernel
+ * 3: r1 = *(u64 *)(r10 - 8)
+ * 4: *(u64 *)(r10 - 8) = r1
+ * 5: call B ;; assume B is not bpf_fastcall for current kernel
+ * 6: r1 = *(u64 *)(r10 - 8)
+ *
+ * There is no need to block bpf_fastcall rewrite for such program.
+ * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy,
+ * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
+ * does not remove spill/fill pair {4,6}.
+ */
+ if (can_be_inlined)
+ env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
+ else
+ subprog->keep_fastcall_stack = 1;
+ subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off);
+}
+
+static int mark_fastcall_patterns(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn;
+ s16 lowest_off;
+ int s, i;
+
+ for (s = 0; s < env->subprog_cnt; ++s, ++subprog) {
+ /* find lowest stack spill offset used in this subprog */
+ lowest_off = 0;
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->dst_reg != BPF_REG_10)
+ continue;
+ lowest_off = min(lowest_off, insn->off);
+ }
+ /* use this offset to find fastcall patterns */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+ mark_fastcall_pattern_for_call(env, subprog, i, lowest_off);
+ }
+ }
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -16772,7 +17282,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
*
* First verification path is [1-6]:
* - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
- * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
* r7 <= X, because r6 and r7 share same id.
* Next verification path is [1-4, 6].
*
@@ -17566,7 +18076,7 @@ hit:
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0);
+ err = err ? : push_jmp_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -17834,7 +18344,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0);
+ err = push_jmp_history(env, state, 0, 0);
if (err)
return err;
}
@@ -18410,6 +18920,53 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index. Also set *reused to true if this map was already in the list of
+ * used maps.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reused)
+{
+ CLASS(fd, f)(fd);
+ struct bpf_map *map;
+ int i;
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ /* check whether we recorded this map already */
+ for (i = 0; i < env->used_map_cnt; i++) {
+ if (env->used_maps[i] == map) {
+ *reused = true;
+ return i;
+ }
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
+ return -E2BIG;
+ }
+
+ if (env->prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in bpf_free_used_maps()
+ */
+ bpf_map_inc(map);
+
+ *reused = false;
+ env->used_maps[env->used_map_cnt++] = map;
+
+ return env->used_map_cnt - 1;
+}
+
/* find and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
@@ -18421,7 +18978,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j, err;
+ int i, err;
err = bpf_prog_calc_tag(env->prog);
if (err)
@@ -18438,9 +18995,10 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
- struct fd f;
+ int map_idx;
u64 addr;
u32 fd;
+ bool reused;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -18501,20 +19059,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
break;
}
- f = fdget(fd);
- map = __bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
- return PTR_ERR(map);
- }
+ map_idx = add_used_map_from_fd(env, fd, &reused);
+ if (map_idx < 0)
+ return map_idx;
+ map = env->used_maps[map_idx];
+
+ aux = &env->insn_aux_data[i];
+ aux->map_index = map_idx;
err = check_map_prog_compatibility(env, map, env->prog);
- if (err) {
- fdput(f);
+ if (err)
return err;
- }
- aux = &env->insn_aux_data[i];
if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
@@ -18523,13 +19079,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
- fdput(f);
return -EINVAL;
}
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
- fdput(f);
return -EINVAL;
}
@@ -18537,7 +19091,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (err) {
verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
map->value_size, off);
- fdput(f);
return err;
}
@@ -18548,70 +19101,39 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
- /* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++) {
- if (env->used_maps[j] == map) {
- aux->map_index = j;
- fdput(f);
- goto next_insn;
- }
- }
-
- if (env->used_map_cnt >= MAX_USED_MAPS) {
- verbose(env, "The total number of maps per program has reached the limit of %u\n",
- MAX_USED_MAPS);
- fdput(f);
- return -E2BIG;
- }
-
- if (env->prog->sleepable)
- atomic64_inc(&map->sleepable_refcnt);
- /* hold the map. If the program is rejected by verifier,
- * the map will be released by release_maps() or it
- * will be used by the valid program until it's unloaded
- * and all maps are released in bpf_free_used_maps()
- */
- bpf_map_inc(map);
-
- aux->map_index = env->used_map_cnt;
- env->used_maps[env->used_map_cnt++] = map;
+ /* proceed with extra checks only if its newly added used map */
+ if (reused)
+ goto next_insn;
if (bpf_map_is_cgroup_storage(map) &&
bpf_cgroup_storage_assign(env->prog->aux, map)) {
verbose(env, "only one cgroup storage of each type is allowed\n");
- fdput(f);
return -EBUSY;
}
if (map->map_type == BPF_MAP_TYPE_ARENA) {
if (env->prog->aux->arena) {
verbose(env, "Only one arena per program\n");
- fdput(f);
return -EBUSY;
}
if (!env->allow_ptr_leaks || !env->bpf_capable) {
verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
- fdput(f);
return -EPERM;
}
if (!env->prog->jit_requested) {
verbose(env, "JIT is required to use arena\n");
- fdput(f);
return -EOPNOTSUPP;
}
if (!bpf_jit_supports_arena()) {
verbose(env, "JIT doesn't support arena\n");
- fdput(f);
return -EOPNOTSUPP;
}
env->prog->aux->arena = (void *)map;
if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
verbose(env, "arena's user address must be set via map_extra or mmap()\n");
- fdput(f);
return -EINVAL;
}
}
- fdput(f);
next_insn:
insn++;
i++;
@@ -18767,6 +19289,9 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
for (i = 0; i < insn_cnt; i++, insn++) {
u8 code = insn->code;
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
continue;
@@ -19026,9 +19551,11 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
return 0;
}
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+
static int opt_remove_nops(struct bpf_verifier_env *env)
{
- const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ const struct bpf_insn ja = NOP;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
int i, err;
@@ -19153,14 +19680,39 @@ apply_patch_buffer:
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0;
+ int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16], *insn;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+ }
+ }
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
@@ -19169,7 +19721,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
- if (cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
@@ -19182,6 +19734,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
@@ -19214,6 +19769,25 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
env->prog->aux->num_exentries++;
continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
} else {
continue;
}
@@ -19316,7 +19890,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
target_size = 0;
cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
&target_size);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@@ -19325,7 +19899,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
- if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier narrow ctx load misconfigured\n");
return -EINVAL;
}
@@ -19350,6 +19924,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->dst_reg, insn->dst_reg,
size * 8, 0);
+patch_insn_buf:
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -19870,7 +20445,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
const int insn_cnt = prog->len;
const struct bpf_map_ops *ops;
struct bpf_insn_aux_data *aux;
- struct bpf_insn insn_buf[16];
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0, cur_subprog = 0;
@@ -19913,13 +20488,46 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
/* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
- /* Make divide-by-zero exceptions impossible. */
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patchlet;
+ struct bpf_insn chk_and_sdiv[] = {
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ };
+ struct bpf_insn chk_and_smod[] = {
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ };
+
+ patchlet = isdiv ? chk_and_sdiv : chk_and_smod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
struct bpf_insn *patchlet;
struct bpf_insn chk_and_div[] = {
/* [R,W]x div 0 -> 0 */
@@ -19939,10 +20547,62 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
};
+ struct bpf_insn chk_and_sdiv[] = {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ };
+ struct bpf_insn chk_and_smod[] = {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1),
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
+ };
- patchlet = isdiv ? chk_and_div : chk_and_mod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ if (is_sdiv) {
+ patchlet = chk_and_sdiv;
+ cnt = ARRAY_SIZE(chk_and_sdiv);
+ } else if (is_smod) {
+ patchlet = chk_and_smod;
+ cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0);
+ } else {
+ patchlet = isdiv ? chk_and_div : chk_and_mod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ }
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
@@ -19989,7 +20649,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -20282,7 +20942,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -20384,7 +21044,7 @@ patch_map_ops_generic:
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
/* Implement bpf_get_smp_processor_id() inline. */
if (insn->imm == BPF_FUNC_get_smp_processor_id &&
- prog->jit_requested && bpf_jit_supports_percpu_insn()) {
+ verifier_inlines_helper_call(env, insn->imm)) {
/* BPF_FUNC_get_smp_processor_id inlining is an
* optimization, so if pcpu_hot.cpu_number is ever
* changed in some incompatible and hard to support
@@ -20642,7 +21302,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int position,
s32 stack_base,
u32 callback_subprogno,
- u32 *cnt)
+ u32 *total_cnt)
{
s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
@@ -20651,55 +21311,56 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int reg_loop_cnt = BPF_REG_7;
int reg_loop_ctx = BPF_REG_8;
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
u32 callback_start;
u32 call_insn_offset;
s32 callback_offset;
+ u32 cnt = 0;
/* This represents an inlined version of bpf_iter.c:bpf_loop,
* be careful to modify this code in sync.
*/
- struct bpf_insn insn_buf[] = {
- /* Return error and jump to the end of the patch if
- * expected number of iterations is too big.
- */
- BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
- BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
- BPF_JMP_IMM(BPF_JA, 0, 0, 16),
- /* spill R6, R7, R8 to use these as loop vars */
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
- /* initialize loop vars */
- BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
- BPF_MOV32_IMM(reg_loop_cnt, 0),
- BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
- /* loop header,
- * if reg_loop_cnt >= reg_loop_max skip the loop body
- */
- BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
- /* callback call,
- * correct callback offset would be set after patching
- */
- BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
- BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
- BPF_CALL_REL(0),
- /* increment loop counter */
- BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
- /* jump to loop header if callback returned 0 */
- BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
- /* return value of bpf_loop,
- * set R0 to the number of iterations
- */
- BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
- /* restore original values of R6, R7, R8 */
- BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
- };
- *cnt = ARRAY_SIZE(insn_buf);
- new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
if (!new_prog)
return new_prog;
@@ -20774,6 +21435,40 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env)
return 0;
}
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -21047,6 +21742,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
u32 btf_id, member_idx;
struct btf *btf;
const char *mname;
+ int err;
if (!prog->gpl_compatible) {
verbose(env, "struct ops programs must have a GPL compatible license\n");
@@ -21094,8 +21790,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8);
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member, prog);
+ err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -21706,6 +22409,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = mark_fastcall_patterns(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = do_check_main(env);
ret = ret ?: do_check_subprogs(env);
@@ -21715,6 +22422,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
skip_full_check:
kvfree(env->explored_states);
+ /* might decrease stack depth, keep it before passes that
+ * allocate additional slots.
+ */
+ if (ret == 0)
+ ret = remove_fastcall_spills_fills(env);
+
if (ret == 0)
ret = check_max_stack_depth(env);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 520b90dd97ec..c964dd7ff967 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -81,6 +81,8 @@ struct cgroup_file_ctx {
struct {
struct cgroup_pidlist *pidlist;
} procs1;
+
+ struct cgroup_of_peak peak;
};
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 90e50d6d3cf3..5886b95c6eae 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1972,6 +1972,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
@@ -4623,8 +4630,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -4672,8 +4680,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
@@ -4717,9 +4726,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -6959,10 +6968,10 @@ struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
struct fd f = fdget_raw(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f.file);
+ cgrp = cgroup_v1v2_get_from_file(fd_file(f));
fdput(f);
return cgrp;
}
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 00009f7d0835..b753695c5a8f 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,10 +1,4 @@
-# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_KERNEL_GZIP is not set
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
-# CONFIG_KERNEL_LZO is not set
-# CONFIG_KERNEL_LZ4 is not set
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 63cf89393c6e..c1048893f4b6 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -505,7 +505,7 @@ int crash_check_hotplug_support(void)
crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
- pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
crash_hotplug_unlock();
return 0;
}
@@ -520,18 +520,25 @@ int crash_check_hotplug_support(void)
}
/*
- * To accurately reflect hot un/plug changes of cpu and memory resources
- * (including onling and offlining of those resources), the elfcorehdr
- * (which is passed to the crash kernel via the elfcorehdr= parameter)
- * must be updated with the new list of CPUs and memories.
+ * To accurately reflect hot un/plug changes of CPU and Memory resources
+ * (including onling and offlining of those resources), the relevant
+ * kexec segments must be updated with latest CPU and Memory resources.
*
- * In order to make changes to elfcorehdr, two conditions are needed:
- * First, the segment containing the elfcorehdr must be large enough
- * to permit a growing number of resources; the elfcorehdr memory size
- * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES.
- * Second, purgatory must explicitly exclude the elfcorehdr from the
- * list of segments it checks (since the elfcorehdr changes and thus
- * would require an update to purgatory itself to update the digest).
+ * Architectures must ensure two things for all segments that need
+ * updating during hotplug events:
+ *
+ * 1. Segments must be large enough to accommodate a growing number of
+ * resources.
+ * 2. Exclude the segments from SHA verification.
+ *
+ * For example, on most architectures, the elfcorehdr (which is passed
+ * to the crash kernel via the elfcorehdr= parameter) must include the
+ * new list of CPUs and memory. To make changes to the elfcorehdr, it
+ * should be large enough to permit a growing number of CPU and Memory
+ * resources. One can estimate the elfcorehdr memory size based on
+ * NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES. The elfcorehdr is
+ * excluded from SHA verification by default if the architecture
+ * supports crash hotplug.
*/
static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg)
{
@@ -540,7 +547,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
- pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
crash_hotplug_unlock();
return;
}
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 64d44a52c011..a620fb4b2116 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -335,6 +335,9 @@ int __init parse_crashkernel(char *cmdline,
if (!*crash_size)
ret = -EINVAL;
+ if (*crash_size >= system_ram)
+ ret = -EINVAL;
+
return ret;
}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c06e56be0ca1..4c0dcd909121 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -8,8 +8,7 @@ config HAS_DMA
depends on !NO_DMA
default y
-config DMA_OPS
- depends on HAS_DMA
+config DMA_OPS_HELPERS
bool
#
@@ -109,8 +108,8 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC
config DMA_NEED_SYNC
def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
- ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || DMA_OPS || \
- SWIOTLB
+ ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || \
+ ARCH_HAS_DMA_OPS || SWIOTLB
config DMA_RESTRICTED_POOL
bool "DMA Restricted Pool"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 21926e46ef4f..6977033444a3 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
-obj-$(CONFIG_DMA_OPS) += ops_helpers.o
-obj-$(CONFIG_DMA_OPS) += dummy.o
+obj-$(CONFIG_DMA_OPS_HELPERS) += ops_helpers.o
+obj-$(CONFIG_ARCH_HAS_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4480a3cd92e0..5b4e6d3bf7bc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -20,7 +20,7 @@
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
-unsigned int zone_dma_bits __ro_after_init = 24;
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
@@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
* zones.
*/
*phys_limit = dma_to_phys(dev, dma_limit);
- if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ if (*phys_limit <= zone_dma_limit)
return GFP_DMA;
if (*phys_limit <= DMA_BIT_MASK(32))
return GFP_DMA32;
@@ -140,7 +140,7 @@ again:
if (!page)
page = alloc_pages_node(node, gfp, get_order(size));
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, size);
+ __free_pages(page, get_order(size));
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
- min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+ min_mask = min_t(u64, min_mask, zone_dma_limit);
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index b492d59ac77e..92de80e5b057 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -17,6 +17,15 @@ static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
{
return DMA_MAPPING_ERROR;
}
+static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_page, so unmap_page should never be
+ * called.
+ */
+ WARN_ON_ONCE(true);
+}
static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
int nelems, enum dma_data_direction dir,
@@ -25,6 +34,16 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
return -EINVAL;
}
+static void dma_dummy_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_sg, so unmap_sg should never be called.
+ */
+ WARN_ON_ONCE(true);
+}
+
static int dma_dummy_supported(struct device *hwdev, u64 mask)
{
return 0;
@@ -33,6 +52,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask)
const struct dma_map_ops dma_dummy_ops = {
.mmap = dma_dummy_mmap,
.map_page = dma_dummy_map_page,
+ .unmap_page = dma_dummy_unmap_page,
.map_sg = dma_dummy_map_sg,
+ .unmap_sg = dma_dummy_unmap_sg,
.dma_supported = dma_dummy_supported,
};
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b1c18058d55f..864a1121bf08 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,6 +10,7 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/iommu-dma.h>
#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
@@ -17,6 +18,9 @@
#include "debug.h"
#include "direct.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/dma.h>
+
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
@@ -116,8 +120,12 @@ EXPORT_SYMBOL(dmam_alloc_attrs);
static bool dma_go_direct(struct device *dev, dma_addr_t mask,
const struct dma_map_ops *ops)
{
+ if (use_dma_iommu(dev))
+ return false;
+
if (likely(!ops))
return true;
+
#ifdef CONFIG_DMA_OPS_BYPASS
if (dev->dma_ops_bypass)
return min_not_zero(mask, dev->bus_dma_limit) >=
@@ -159,9 +167,13 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
if (dma_map_direct(dev, ops) ||
arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
kmsan_handle_dma(page, offset, size, dir);
+ trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir,
+ attrs);
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
@@ -177,8 +189,11 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
if (dma_map_direct(dev, ops) ||
arch_dma_unmap_page_direct(dev, addr + size))
dma_direct_unmap_page(dev, addr, size, dir, attrs);
- else if (ops->unmap_page)
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_page(dev, addr, size, dir, attrs);
+ else
ops->unmap_page(dev, addr, size, dir, attrs);
+ trace_dma_unmap_page(dev, addr, size, dir, attrs);
debug_dma_unmap_page(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);
@@ -197,11 +212,14 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
if (dma_map_direct(dev, ops) ||
arch_dma_map_sg_direct(dev, sg, nents))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
if (ents > 0) {
kmsan_handle_dma_sg(sg, nents, dir);
+ trace_dma_map_sg(dev, sg, nents, ents, dir, attrs);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
} else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
ents != -EIO && ents != -EREMOTEIO)) {
@@ -287,10 +305,13 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
+ trace_dma_unmap_sg(dev, sg, nents, dir, attrs);
debug_dma_unmap_sg(dev, sg, nents, dir);
if (dma_map_direct(dev, ops) ||
arch_dma_unmap_sg_direct(dev, sg, nents))
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_sg(dev, sg, nents, dir, attrs);
else if (ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
}
@@ -309,9 +330,12 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
if (dma_map_direct(dev, ops))
addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs);
else if (ops->map_resource)
addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
+ trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs);
debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
return addr;
}
@@ -323,8 +347,13 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (!dma_map_direct(dev, ops) && ops->unmap_resource)
+ if (dma_map_direct(dev, ops))
+ ; /* nothing to do: uncached and no swiotlb */
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_resource(dev, addr, size, dir, attrs);
+ else if (ops->unmap_resource)
ops->unmap_resource(dev, addr, size, dir, attrs);
+ trace_dma_unmap_resource(dev, addr, size, dir, attrs);
debug_dma_unmap_resource(dev, addr, size, dir);
}
EXPORT_SYMBOL(dma_unmap_resource);
@@ -338,8 +367,11 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu)
ops->sync_single_for_cpu(dev, addr, size, dir);
+ trace_dma_sync_single_for_cpu(dev, addr, size, dir);
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
EXPORT_SYMBOL(__dma_sync_single_for_cpu);
@@ -352,8 +384,11 @@ void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_device(dev, addr, size, dir);
else if (ops->sync_single_for_device)
ops->sync_single_for_device(dev, addr, size, dir);
+ trace_dma_sync_single_for_device(dev, addr, size, dir);
debug_dma_sync_single_for_device(dev, addr, size, dir);
}
EXPORT_SYMBOL(__dma_sync_single_for_device);
@@ -366,8 +401,11 @@ void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
else if (ops->sync_sg_for_cpu)
ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(__dma_sync_sg_for_cpu);
@@ -380,8 +418,11 @@ void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_device(dev, sg, nelems, dir);
else if (ops->sync_sg_for_device)
ops->sync_sg_for_device(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_device(dev, sg, nelems, dir);
debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
}
EXPORT_SYMBOL(__dma_sync_sg_for_device);
@@ -405,7 +446,7 @@ static void dma_setup_need_sync(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC))
+ if (dma_map_direct(dev, ops) || use_dma_iommu(dev))
/*
* dma_skip_sync will be reset to %false on first SWIOTLB buffer
* mapping, if any. During the device initialization, it's
@@ -446,6 +487,9 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
if (dma_alloc_direct(dev, ops))
return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
if (!ops->get_sgtable)
return -ENXIO;
return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
@@ -482,6 +526,8 @@ bool dma_can_mmap(struct device *dev)
if (dma_alloc_direct(dev, ops))
return dma_direct_can_mmap(dev);
+ if (use_dma_iommu(dev))
+ return true;
return ops->mmap != NULL;
}
EXPORT_SYMBOL_GPL(dma_can_mmap);
@@ -508,6 +554,9 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
if (dma_alloc_direct(dev, ops))
return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
if (!ops->mmap)
return -ENXIO;
return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
@@ -520,6 +569,10 @@ u64 dma_get_required_mask(struct device *dev)
if (dma_alloc_direct(dev, ops))
return dma_direct_get_required_mask(dev);
+
+ if (use_dma_iommu(dev))
+ return DMA_BIT_MASK(32);
+
if (ops->get_required_mask)
return ops->get_required_mask(dev);
@@ -559,11 +612,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (dma_alloc_direct(dev, ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+ else if (use_dma_iommu(dev))
+ cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
else
return NULL;
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs);
debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
@@ -588,9 +644,12 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
if (!cpu_addr)
return;
+ trace_dma_free(dev, cpu_addr, dma_handle, size, attrs);
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs);
else if (ops->free)
ops->free(dev, size, cpu_addr, dma_handle, attrs);
}
@@ -611,6 +670,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (use_dma_iommu(dev))
+ return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp);
if (!ops->alloc_pages_op)
return NULL;
return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
@@ -621,8 +682,11 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
{
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (page)
+ if (page) {
+ trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size,
+ dir, 0);
debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ }
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -635,6 +699,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
dma_direct_free_pages(dev, size, page, dma_handle, dir);
+ else if (use_dma_iommu(dev))
+ dma_common_free_pages(dev, size, page, dma_handle, dir);
else if (ops->free_pages)
ops->free_pages(dev, size, page, dma_handle, dir);
}
@@ -642,6 +708,7 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
+ trace_dma_unmap_page(dev, dma_handle, size, dir, 0);
debug_dma_unmap_page(dev, dma_handle, size, dir);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
@@ -687,7 +754,6 @@ out_free_sgt:
struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
struct sg_table *sgt;
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
@@ -695,13 +761,14 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (WARN_ON_ONCE(gfp & __GFP_COMP))
return NULL;
- if (ops && ops->alloc_noncontiguous)
- sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+ if (use_dma_iommu(dev))
+ sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs);
else
sgt = alloc_single_sgt(dev, size, dir, gfp);
if (sgt) {
sgt->nents = 1;
+ trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
}
return sgt;
@@ -720,11 +787,11 @@ static void free_single_sgt(struct device *dev, size_t size,
void dma_free_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt, enum dma_data_direction dir)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
+ trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0);
debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
- if (ops && ops->free_noncontiguous)
- ops->free_noncontiguous(dev, size, sgt, dir);
+
+ if (use_dma_iommu(dev))
+ iommu_dma_free_noncontiguous(dev, size, sgt, dir);
else
free_single_sgt(dev, size, sgt, dir);
}
@@ -733,37 +800,26 @@ EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
void *dma_vmap_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- if (ops && ops->alloc_noncontiguous)
- return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+ if (use_dma_iommu(dev))
+ return iommu_dma_vmap_noncontiguous(dev, size, sgt);
+
return page_address(sg_page(sgt->sgl));
}
EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (ops && ops->alloc_noncontiguous)
- vunmap(vaddr);
+ if (use_dma_iommu(dev))
+ iommu_dma_vunmap_noncontiguous(dev, vaddr);
}
EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
size_t size, struct sg_table *sgt)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- if (ops && ops->alloc_noncontiguous) {
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- if (vma->vm_pgoff >= count ||
- vma_pages(vma) > count - vma->vm_pgoff)
- return -ENXIO;
- return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
- }
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt);
return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
}
EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
@@ -772,32 +828,37 @@ static int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ if (use_dma_iommu(dev)) {
+ if (WARN_ON(ops))
+ return false;
+ return true;
+ }
+
/*
- * ->dma_supported sets the bypass flag, so we must always call
- * into the method here unless the device is truly direct mapped.
+ * ->dma_supported sets and clears the bypass flag, so ignore it here
+ * and always call into the method if there is one.
*/
- if (!ops)
- return dma_direct_supported(dev, mask);
- if (!ops->dma_supported)
- return 1;
- return ops->dma_supported(dev, mask);
+ if (ops) {
+ if (!ops->dma_supported)
+ return true;
+ return ops->dma_supported(dev, mask);
+ }
+
+ return dma_direct_supported(dev, mask);
}
bool dma_pci_p2pdma_supported(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- /* if ops is not set, dma direct will be used which supports P2PDMA */
- if (!ops)
- return true;
-
/*
* Note: dma_ops_bypass is not checked here because P2PDMA should
* not be used with dma mapping ops that do not have support even
* if the specific device is bypassing them.
*/
- return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+ /* if ops is not set, dma direct and default IOMMU support P2PDMA */
+ return !ops;
}
EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
@@ -852,7 +913,7 @@ bool dma_addressing_limited(struct device *dev)
dma_get_required_mask(dev))
return true;
- if (unlikely(ops))
+ if (unlikely(ops) || use_dma_iommu(dev))
return false;
return !dma_direct_all_ram_mapped(dev);
}
@@ -865,6 +926,8 @@ size_t dma_max_mapping_size(struct device *dev)
if (dma_map_direct(dev, ops))
size = dma_direct_max_mapping_size(dev);
+ else if (use_dma_iommu(dev))
+ size = iommu_dma_max_mapping_size(dev);
else if (ops && ops->max_mapping_size)
size = ops->max_mapping_size(dev);
@@ -877,7 +940,9 @@ size_t dma_opt_mapping_size(struct device *dev)
const struct dma_map_ops *ops = get_dma_ops(dev);
size_t size = SIZE_MAX;
- if (ops && ops->opt_mapping_size)
+ if (use_dma_iommu(dev))
+ size = iommu_dma_opt_mapping_size();
+ else if (ops && ops->opt_mapping_size)
size = ops->opt_mapping_size();
return min(dma_max_mapping_size(dev), size);
@@ -888,6 +953,9 @@ unsigned long dma_get_merge_boundary(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_merge_boundary(dev);
+
if (!ops || !ops->get_merge_boundary)
return 0; /* can't merge */
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index af4a6ef48ce0..9afd569eadb9 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -4,6 +4,7 @@
* the allocated memory contains normal pages in the direct kernel mapping.
*/
#include <linux/dma-map-ops.h>
+#include <linux/iommu-dma.h>
static struct page *dma_common_vaddr_to_page(void *cpu_addr)
{
@@ -70,8 +71,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
- *dma_handle = ops->map_page(dev, page, 0, size, dir,
- DMA_ATTR_SKIP_CPU_SYNC);
+ if (use_dma_iommu(dev))
+ *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else
+ *dma_handle = ops->map_page(dev, page, 0, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
if (*dma_handle == DMA_MAPPING_ERROR) {
dma_free_contiguous(dev, page, size);
return NULL;
@@ -86,7 +91,10 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (ops->unmap_page)
+ if (use_dma_iommu(dev))
+ iommu_dma_unmap_page(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else if (ops->unmap_page)
ops->unmap_page(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
dma_free_contiguous(dev, page, size);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d10613eb0f63..7b04f7575796 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
/* CMA can't cross zone boundaries, see cma_activate_area() */
end = cma_get_base(cma) + size - 1;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
- return end <= DMA_BIT_MASK(zone_dma_bits);
+ return end <= zone_dma_limit;
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
- return end <= DMA_BIT_MASK(32);
+ return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
return true;
}
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 27596f3b4aef..9e2afad1c615 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -10,8 +10,10 @@ struct page **dma_common_find_pages(void *cpu_addr)
{
struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || area->flags != VM_DMA_COHERENT)
+ if (!area || !(area->flags & VM_DMA_COHERENT))
return NULL;
+ WARN(area->flags != VM_DMA_COHERENT,
+ "unexpected flags in area: %p\n", cpu_addr);
return area->pages;
}
@@ -61,7 +63,7 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
{
struct vm_struct *area = find_vm_area(cpu_addr);
- if (!area || area->flags != VM_DMA_COHERENT) {
+ if (!area || !(area->flags & VM_DMA_COHERENT)) {
WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
return;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index df68d29740a0..abcf3fa63a56 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
if (!remap)
io_tlb_default_mem.can_grow = true;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
- io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+ io_tlb_default_mem.phys_limit = zone_dma_limit;
else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
- io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+ io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
else
io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
#endif
@@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
}
gfp &= ~GFP_ZONEMASK;
- if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ if (phys_limit <= zone_dma_limit)
gfp |= __GFP_DMA;
else if (phys_limit <= DMA_BIT_MASK(32))
gfp |= __GFP_DMA32;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4f03eb908e7f..5a8071c45c80 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -969,10 +969,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct fd f = fdget(fd);
int ret = 0;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
@@ -6001,10 +6001,10 @@ static const struct file_operations perf_fops;
static inline int perf_fget_light(int fd, struct fd *p)
{
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (f.file->f_op != &perf_fops) {
+ if (fd_file(f)->f_op != &perf_fops) {
fdput(f);
return -EBADF;
}
@@ -6064,7 +6064,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
ret = perf_fget_light(arg, &output);
if (ret)
return ret;
- output_event = output.file->private_data;
+ output_event = fd_file(output)->private_data;
ret = perf_event_set_output(event, output_event);
fdput(output);
} else {
@@ -8964,7 +8964,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
if (atomic_read(&nr_build_id_events))
- build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
@@ -12665,7 +12665,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = {NULL, 0};
+ struct fd group = EMPTY_FD;
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
@@ -12740,7 +12740,7 @@ SYSCALL_DEFINE5(perf_event_open,
err = perf_fget_light(group_fd, &group);
if (err)
goto err_fd;
- group_leader = group.file->private_data;
+ group_leader = fd_file(group)->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -14002,21 +14002,19 @@ static void perf_event_setup_cpumask(unsigned int cpu)
struct cpumask *pmu_cpumask;
unsigned int scope;
- cpumask_set_cpu(cpu, perf_online_mask);
-
/*
* Early boot stage, the cpumask hasn't been set yet.
* The perf_online_<domain>_masks includes the first CPU of each domain.
- * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
+ * Always unconditionally set the boot CPU for the perf_online_<domain>_masks.
*/
- if (!topology_sibling_cpumask(cpu)) {
+ if (cpumask_empty(perf_online_mask)) {
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
pmu_cpumask = perf_scope_cpumask(scope);
if (WARN_ON_ONCE(!pmu_cpumask))
continue;
cpumask_set_cpu(cpu, pmu_cpumask);
}
- return;
+ goto end;
}
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
@@ -14031,6 +14029,8 @@ static void perf_event_setup_cpumask(unsigned int cpu)
cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
cpumask_set_cpu(cpu, pmu_cpumask);
}
+end:
+ cpumask_set_cpu(cpu, perf_online_mask);
}
int perf_event_init_cpu(unsigned int cpu)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4b7e590dc428..2ec796e2f055 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -103,8 +103,7 @@ struct xol_area {
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
- struct page *pages[2];
+ struct page *page;
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
@@ -1466,6 +1465,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->page;
+ get_page(vmf->page);
+ return 0;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1492,7 +1506,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1531,12 +1545,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.pages = area->pages;
- area->pages[0] = alloc_page(GFP_HIGHUSER);
- if (!area->pages[0])
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
goto free_bitmap;
- area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
@@ -1544,12 +1555,12 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
insns = arch_uprobe_trampoline(&insns_size);
- arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
+ arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->pages[0]);
+ __free_page(area->page);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1591,7 +1602,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->pages[0]);
+ put_page(area->page);
kfree(area->bitmap);
kfree(area);
}
@@ -1658,7 +1669,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
diff --git a/kernel/exit.c b/kernel/exit.c
index 0d62a53605df..619f0014c33b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -778,6 +778,62 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
#ifdef CONFIG_DEBUG_STACK_USAGE
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+# ifdef CONFIG_STACK_GROWSUP
+ n--;
+# else
+ n++;
+# endif
+ } while (!*n);
+
+# ifdef CONFIG_STACK_GROWSUP
+ return (unsigned long)end_of_stack(p) - (unsigned long)n;
+# else
+ return (unsigned long)n - (unsigned long)end_of_stack(p);
+# endif
+}
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ if (used_stack <= 1024)
+ count_vm_event(KSTACK_1K);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ count_vm_event(KSTACK_2K);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ count_vm_event(KSTACK_4K);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ count_vm_event(KSTACK_8K);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ count_vm_event(KSTACK_16K);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ count_vm_event(KSTACK_32K);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ count_vm_event(KSTACK_64K);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ count_vm_event(KSTACK_REST);
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+}
+
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
@@ -785,6 +841,7 @@ static void check_stack_usage(void)
unsigned long free;
free = stack_not_used(current);
+ kstack_histogram(THREAD_SIZE - free);
if (free >= lowest_to_date)
return;
diff --git a/kernel/fork.c b/kernel/fork.c
index d4b2d543f48c..cbdaca45d0c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -832,7 +833,7 @@ static void check_mm(struct mm_struct *mm)
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
mm_pgtables_bytes(mm));
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -969,6 +970,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
@@ -1276,7 +1278,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
@@ -2346,7 +2348,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
@@ -2479,7 +2481,9 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
- sched_cgroup_fork(p, args);
+ retval = sched_cgroup_fork(p, args);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
/*
* From this point on we must avoid any synchronous user-space
@@ -2525,13 +2529,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* No more failure paths after this point. */
@@ -2605,10 +2609,11 @@ __latent_entropy struct task_struct *copy_process(
return p;
-bad_fork_cancel_cgroup:
+bad_fork_core_free:
sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
@@ -2647,6 +2652,8 @@ bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+ sched_cancel_fork(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
diff --git a/kernel/freezer.c b/kernel/freezer.c
index f57aaf96b829..44bbd7dbd2c8 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop)
bool freeze;
raw_spin_lock_irq(&current->pi_lock);
- set_current_state(TASK_FROZEN);
+ WRITE_ONCE(current->__state, TASK_FROZEN);
/* unstale saved_state so that __thaw_task() will wake us up */
current->saved_state = TASK_RUNNING;
raw_spin_unlock_irq(&current->pi_lock);
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 06a1f091be81..136768ae2637 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,6 +34,7 @@
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 1c7e5159064c..3a24d6b5f559 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -832,7 +832,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
struct irq_chip *chip = info->chip;
BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
- if (!chip->irq_set_affinity)
+ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY))
chip->irq_set_affinity = msi_domain_set_affinity;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 2595defe8c0d..d35d9792402d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -23,7 +23,8 @@ int kimage_is_destination_range(struct kimage *image,
extern atomic_t __kexec_lock;
static inline bool kexec_trylock(void)
{
- return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+ int old = 0;
+ return atomic_try_cmpxchg_acquire(&__kexec_lock, &old, 1);
}
static inline void kexec_unlock(void)
{
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f7be976ff88a..db4ceb0f503c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -845,8 +845,16 @@ repeat:
* event only cares about the address.
*/
trace_sched_kthread_work_execute_end(work, func);
- } else if (!freezing(current))
+ } else if (!freezing(current)) {
schedule();
+ } else {
+ /*
+ * Handle the case where the current remains
+ * TASK_INTERRUPTIBLE. try_to_freeze() expects
+ * the current to be TASK_RUNNING.
+ */
+ __set_current_state(TASK_RUNNING);
+ }
try_to_freeze();
cond_resched();
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fba1229f1de6..ebebd0eec7f6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task)
{
int prio = task->prio;
- if (!rt_prio(prio))
+ if (!rt_or_dl_prio(prio))
return DEFAULT_PRIO;
return prio;
@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+ if (rt_or_dl_prio(waiter->tree.prio))
return false;
return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 33cac79e3994..5ded7dff46ef 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* if it is an RT task or wait in the wait queue
* for too long.
*/
- if (has_handoff || (!rt_task(waiter->task) &&
+ if (has_handoff || (!rt_or_dl_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -914,7 +914,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (owner_state != OWNER_WRITER) {
if (need_resched())
break;
- if (rt_task(current) &&
+ if (rt_or_dl_task(current) &&
(prev_owner_state != OWNER_WRITER))
break;
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 78719e1ef1b1..10a5736a21c2 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -697,3 +697,4 @@ module_exit(test_ww_mutex_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("API test facility for ww_mutexes");
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 3ad2cc4823e5..76d204b7d29c 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
int a_prio = a->task->prio;
int b_prio = b->task->prio;
- if (rt_prio(a_prio) || rt_prio(b_prio)) {
+ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
if (a_prio > b_prio)
return true;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 4047b6d48255..05a9a06a140c 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -160,6 +160,7 @@ config MODULE_UNLOAD_TAINT_TRACKING
config MODVERSIONS
bool "Module versioning support"
+ depends on !COMPILE_TEST
help
Usually, you have to use modules compiled with your kernel.
Saying Y here makes it sometimes possible to use modules
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 71396e297499..49b9bca9de12 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3234,7 +3234,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return -EINVAL;
f = fdget(fd);
- err = idempotent_init_module(f.file, uargs, flags);
+ err = idempotent_init_module(fd_file(f), uargs, flags);
fdput(f);
return err;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6ec3deec68c2..dc952c3b05af 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -550,15 +550,15 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
struct nsset nsset = {};
int err = 0;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (proc_ns_file(f.file)) {
- ns = get_proc_ns(file_inode(f.file));
+ if (proc_ns_file(fd_file(f))) {
+ ns = get_proc_ns(file_inode(fd_file(f)));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(f.file))) {
+ } else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -570,10 +570,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(f.file))
+ if (proc_ns_file(fd_file(f)))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, pidfd_pid(f.file));
+ err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
diff --git a/kernel/numa.c b/kernel/numa.c
deleted file mode 100644
index 67ca6b8585c0..000000000000
--- a/kernel/numa.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <linux/printk.h>
-#include <linux/numa.h>
-
-/* Stub functions: */
-
-#ifndef memory_add_physaddr_to_nid
-int memory_add_physaddr_to_nid(u64 start)
-{
- pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
- start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
-#ifndef phys_to_target_node
-int phys_to_target_node(u64 start)
-{
- pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
- start);
- return 0;
-}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
-#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 753d12f4dc8f..fbc59b3b64d0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -384,7 +384,7 @@ void panic(const char *fmt, ...)
panic_print_sys_info(false);
- kmsg_dump(KMSG_DUMP_PANIC);
+ kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
/*
* If you doubt kdump always works fine in any situation,
diff --git a/kernel/pid.c b/kernel/pid.c
index da76ed1873f7..2715afb77eab 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -540,13 +540,13 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
struct pid *pid;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (!IS_ERR(pid)) {
get_pid(pid);
- *flags = f.file->f_flags;
+ *flags = fd_file(f)->f_flags;
}
fdput(f);
@@ -755,10 +755,10 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
return -EINVAL;
f = fdget(pidfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (IS_ERR(pid))
ret = PTR_ERR(pid);
else
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 71e4fe6f9b85..beb808f4c367 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -4668,16 +4668,21 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
/**
- * kmsg_dump - dump kernel log to kernel message dumpers.
+ * kmsg_dump_desc - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
+ * @desc: a short string to describe what caused the panic or oops. Can be NULL
+ * if no additional description is available.
*
* Call each of the registered dumper's dump() callback, which can
* retrieve the kmsg records with kmsg_dump_get_line() or
* kmsg_dump_get_buffer().
*/
-void kmsg_dump(enum kmsg_dump_reason reason)
+void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
{
struct kmsg_dumper *dumper;
+ struct kmsg_dump_detail detail = {
+ .reason = reason,
+ .description = desc};
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -4695,7 +4700,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
continue;
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
+ dumper->dump(dumper, &detail);
}
rcu_read_unlock();
}
diff --git a/kernel/resource.c b/kernel/resource.c
index a83040fde236..b730bd28b422 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -450,8 +450,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
/* re-alloc */
struct resource *rams_new;
- rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
- (rams_size + 16) * sizeof(struct resource),
+ rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource),
GFP_KERNEL);
if (!rams_new)
goto out;
@@ -540,20 +539,62 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
size_t size, unsigned long flags,
unsigned long desc)
{
- struct resource res;
+ resource_size_t ostart, oend;
int type = 0; int other = 0;
- struct resource *p;
+ struct resource *p, *dp;
+ bool is_type, covered;
+ struct resource res;
res.start = start;
res.end = start + size - 1;
for (p = parent->child; p ; p = p->sibling) {
- bool is_type = (((p->flags & flags) == flags) &&
- ((desc == IORES_DESC_NONE) ||
- (desc == p->desc)));
-
- if (resource_overlaps(p, &res))
- is_type ? type++ : other++;
+ if (!resource_overlaps(p, &res))
+ continue;
+ is_type = (p->flags & flags) == flags &&
+ (desc == IORES_DESC_NONE || desc == p->desc);
+ if (is_type) {
+ type++;
+ continue;
+ }
+ /*
+ * Continue to search in descendant resources as if the
+ * matched descendant resources cover some ranges of 'p'.
+ *
+ * |------------- "CXL Window 0" ------------|
+ * |-- "System RAM" --|
+ *
+ * will behave similar as the following fake resource
+ * tree when searching "System RAM".
+ *
+ * |-- "System RAM" --||-- "CXL Window 0a" --|
+ */
+ covered = false;
+ ostart = max(res.start, p->start);
+ oend = min(res.end, p->end);
+ for_each_resource(p, dp, false) {
+ if (!resource_overlaps(dp, &res))
+ continue;
+ is_type = (dp->flags & flags) == flags &&
+ (desc == IORES_DESC_NONE || desc == dp->desc);
+ if (is_type) {
+ type++;
+ /*
+ * Range from 'ostart' to 'dp->start'
+ * isn't covered by matched resource.
+ */
+ if (dp->start > ostart)
+ break;
+ if (dp->end >= oend) {
+ covered = true;
+ break;
+ }
+ /* Remove covered range */
+ ostart = max(ostart, dp->end + 1);
+ }
+ }
+ if (!covered)
+ other++;
}
if (type == 0)
@@ -1818,7 +1859,11 @@ EXPORT_SYMBOL(resource_list_free);
#ifdef CONFIG_GET_FREE_REGION
#define GFR_DESCENDING (1UL << 0)
#define GFR_REQUEST_REGION (1UL << 1)
-#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+#ifdef PA_SECTION_SHIFT
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+#else
+#define GFR_DEFAULT_ALIGN PAGE_SIZE
+#endif
static resource_size_t gfr_start(struct resource *base, resource_size_t size,
resource_size_t align, unsigned long flags)
@@ -1830,7 +1875,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
return end - size + 1;
}
- return ALIGN(base->start, align);
+ return ALIGN(max(base->start, align), align);
}
static bool gfr_continue(struct resource *base, resource_size_t addr,
@@ -2004,7 +2049,7 @@ struct resource *alloc_free_mem_region(struct resource *base,
return get_free_mem_region(NULL, base, size, align, name,
IORES_DESC_NONE, flags);
}
-EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+EXPORT_SYMBOL_GPL(alloc_free_mem_region);
#endif /* CONFIG_GET_FREE_REGION */
static int __init strict_iomem(char *str)
diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c
index 0e509985a44a..42d2d8d20f5d 100644
--- a/kernel/resource_kunit.c
+++ b/kernel/resource_kunit.c
@@ -7,6 +7,8 @@
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/string.h>
+#include <linux/sizes.h>
+#include <linux/mm.h>
#define R0_START 0x0000
#define R0_END 0xffff
@@ -137,9 +139,150 @@ static void resource_test_intersection(struct kunit *test)
} while (++i < ARRAY_SIZE(results_for_intersection));
}
+/*
+ * The test resource tree for region_intersects() test:
+ *
+ * BASE-BASE+1M-1 : Test System RAM 0
+ * # hole 0 (BASE+1M-BASE+2M)
+ * BASE+2M-BASE+3M-1 : Test CXL Window 0
+ * BASE+3M-BASE+4M-1 : Test System RAM 1
+ * BASE+4M-BASE+7M-1 : Test CXL Window 1
+ * BASE+4M-BASE+5M-1 : Test System RAM 2
+ * BASE+4M+128K-BASE+4M+256K-1: Test Code
+ * BASE+5M-BASE+6M-1 : Test System RAM 3
+ */
+#define RES_TEST_RAM0_OFFSET 0
+#define RES_TEST_RAM0_SIZE SZ_1M
+#define RES_TEST_HOLE0_OFFSET (RES_TEST_RAM0_OFFSET + RES_TEST_RAM0_SIZE)
+#define RES_TEST_HOLE0_SIZE SZ_1M
+#define RES_TEST_WIN0_OFFSET (RES_TEST_HOLE0_OFFSET + RES_TEST_HOLE0_SIZE)
+#define RES_TEST_WIN0_SIZE SZ_1M
+#define RES_TEST_RAM1_OFFSET (RES_TEST_WIN0_OFFSET + RES_TEST_WIN0_SIZE)
+#define RES_TEST_RAM1_SIZE SZ_1M
+#define RES_TEST_WIN1_OFFSET (RES_TEST_RAM1_OFFSET + RES_TEST_RAM1_SIZE)
+#define RES_TEST_WIN1_SIZE (SZ_1M * 3)
+#define RES_TEST_RAM2_OFFSET RES_TEST_WIN1_OFFSET
+#define RES_TEST_RAM2_SIZE SZ_1M
+#define RES_TEST_CODE_OFFSET (RES_TEST_RAM2_OFFSET + SZ_128K)
+#define RES_TEST_CODE_SIZE SZ_128K
+#define RES_TEST_RAM3_OFFSET (RES_TEST_RAM2_OFFSET + RES_TEST_RAM2_SIZE)
+#define RES_TEST_RAM3_SIZE SZ_1M
+#define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE))
+
+static void remove_free_resource(void *ctx)
+{
+ struct resource *res = (struct resource *)ctx;
+
+ remove_resource(res);
+ kfree(res);
+}
+
+static void resource_test_request_region(struct kunit *test, struct resource *parent,
+ resource_size_t start, resource_size_t size,
+ const char *name, unsigned long flags)
+{
+ struct resource *res;
+
+ res = __request_region(parent, start, size, name, flags);
+ KUNIT_ASSERT_NOT_NULL(test, res);
+ kunit_add_action_or_reset(test, remove_free_resource, res);
+}
+
+static void resource_test_insert_resource(struct kunit *test, struct resource *parent,
+ resource_size_t start, resource_size_t size,
+ const char *name, unsigned long flags)
+{
+ struct resource *res;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, res);
+
+ res->name = name;
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = flags;
+ if (insert_resource(parent, res)) {
+ kfree(res);
+ KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res);
+ }
+
+ kunit_add_action_or_reset(test, remove_free_resource, res);
+}
+
+static void resource_test_region_intersects(struct kunit *test)
+{
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *parent;
+ resource_size_t start;
+
+ /* Find an iomem_resource hole to hold test resources */
+ parent = alloc_free_mem_region(&iomem_resource, RES_TEST_TOTAL_SIZE, SZ_1M,
+ "test resources");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent);
+ start = parent->start;
+ kunit_add_action_or_reset(test, remove_free_resource, parent);
+
+ resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET,
+ RES_TEST_RAM0_SIZE, "Test System RAM 0", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_WIN0_OFFSET,
+ RES_TEST_WIN0_SIZE, "Test CXL Window 0",
+ IORESOURCE_MEM);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM1_OFFSET,
+ RES_TEST_RAM1_SIZE, "Test System RAM 1", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_WIN1_OFFSET,
+ RES_TEST_WIN1_SIZE, "Test CXL Window 1",
+ IORESOURCE_MEM);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM2_OFFSET,
+ RES_TEST_RAM2_SIZE, "Test System RAM 2", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_CODE_OFFSET,
+ RES_TEST_CODE_SIZE, "Test Code", flags);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM3_OFFSET,
+ RES_TEST_RAM3_SIZE, "Test System RAM 3", flags);
+ kunit_release_action(test, remove_free_resource, parent);
+
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM0_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM0_OFFSET +
+ RES_TEST_RAM0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_DISJOINT,
+ region_intersects(start + RES_TEST_HOLE0_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_DISJOINT,
+ region_intersects(start + RES_TEST_HOLE0_OFFSET +
+ RES_TEST_HOLE0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_MIXED,
+ region_intersects(start + RES_TEST_WIN0_OFFSET +
+ RES_TEST_WIN0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM1_OFFSET +
+ RES_TEST_RAM1_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM2_OFFSET +
+ RES_TEST_RAM2_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_CODE_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM2_OFFSET,
+ RES_TEST_RAM2_SIZE + PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_MIXED,
+ region_intersects(start + RES_TEST_RAM3_OFFSET,
+ RES_TEST_RAM3_SIZE + PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+}
+
static struct kunit_case resource_test_cases[] = {
KUNIT_CASE(resource_test_union),
KUNIT_CASE(resource_test_intersection),
+ KUNIT_CASE(resource_test_region_intersects),
{}
};
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 39c315182b35..fae1f5c921eb 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -16,18 +16,25 @@
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
+#include <linux/kobject.h>
#include <linux/livepatch.h>
+#include <linux/pm.h>
#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
#include <linux/seqlock_api.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/tsacct_kern.h>
#include <linux/vtime.h>
+#include <linux/sysrq.h>
+#include <linux/percpu-rwsem.h>
#include <uapi/linux/sched/types.h>
@@ -52,4 +59,8 @@
#include "cputime.c"
#include "deadline.c"
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
+
#include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d7f5941bcdc..43e453ab7e20 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -163,13 +163,19 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
- if (rt_prio(p->prio)) /* includes deadline */
+ if (p->dl_server)
+ return -1; /* deadline */
+
+ if (rt_or_dl_prio(p->prio))
return p->prio; /* [-1, 99] */
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+ if (task_on_scx(p))
+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}
/*
@@ -192,12 +198,33 @@ static inline bool prio_less(const struct task_struct *a,
if (-pb < -pa)
return false;
- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
- return !dl_time_before(a->dl.deadline, b->dl.deadline);
+ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
+ const struct sched_dl_entity *a_dl, *b_dl;
+
+ a_dl = &a->dl;
+ /*
+ * Since,'a' and 'b' can be CFS tasks served by DL server,
+ * __task_prio() can return -1 (for DL) even for those. In that
+ * case, get to the dl_server's DL entity.
+ */
+ if (a->dl_server)
+ a_dl = a->dl_server;
+
+ b_dl = &b->dl;
+ if (b->dl_server)
+ b_dl = b->dl_server;
+
+ return !dl_time_before(a_dl->deadline, b_dl->deadline);
+ }
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
+ return scx_prio_less(a, b, in_fi);
+#endif
+
return false;
}
@@ -240,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (!p->core_cookie)
@@ -250,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (sched_core_enqueued(p)) {
@@ -1255,11 +1288,14 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
- if (rq->nr_running > 1)
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
+ if (rq->cfs.nr_running > 1)
return false;
/*
@@ -1269,7 +1305,7 @@ bool sched_can_stop_tick(struct rq *rq)
* dequeued by migrating while the constrained task continues to run.
* E.g. going from 2->1 without going through pick_next_task().
*/
- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (__need_bw_check(rq, rq->curr)) {
if (cfs_task_bw_constrained(rq->curr))
return false;
}
@@ -1341,8 +1377,8 @@ void set_load_weight(struct task_struct *p, bool update_load)
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class)
- reweight_task(p, &lw);
+ if (update_load && p->sched_class->reweight_task)
+ p->sched_class->reweight_task(task_rq(p), p, &lw);
else
p->se.load = lw;
}
@@ -1672,6 +1708,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
@@ -1696,6 +1735,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
@@ -1975,14 +2017,21 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
- uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ /*
+ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+ * ->sched_delayed.
+ */
+ uclamp_rq_inc(rq, p);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
-void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+/*
+ * Must only return false when DEQUEUE_SLEEP.
+ */
+inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
@@ -1995,8 +2044,12 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ /*
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
+ */
uclamp_rq_dec(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
+ return p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2014,12 +2067,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ /*
+ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before*
+ * dequeue_task() and cleared *after* enqueue_task().
+ */
+
dequeue_task(rq, p, flags);
}
+static void block_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+ __block_task(rq, p);
+}
+
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
@@ -2032,6 +2098,17 @@ inline int task_curr(const struct task_struct *p)
}
/*
+ * ->switching_to() is called with the pi_lock and rq_lock held and must not
+ * mess with locking.
+ */
+void check_class_changing(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class)
+{
+ if (prev_class != p->sched_class && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+}
+
+/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
*
@@ -2233,6 +2310,12 @@ void migrate_disable(void)
struct task_struct *p = current;
if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ *Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
p->migration_disabled++;
return;
}
@@ -2251,14 +2334,20 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
- if (WARN_ON_ONCE(!p->migration_disabled))
- return;
-
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
@@ -2289,7 +2378,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
/* When not in the task's cpumask, no point in looking further. */
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!task_allowed_on_cpu(p, cpu))
return false;
/* migrate_disabled() must be allowed to finish. */
@@ -2298,7 +2387,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu) && task_cpu_possible(cpu, p);
+ return cpu_active(cpu);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -3607,8 +3696,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
#endif
-
- p->dl_server = NULL;
}
/*
@@ -3644,12 +3731,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (p->se.sched_delayed)
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -3776,6 +3865,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
+ * The BPF scheduler may depend on select_task_rq() being invoked during
+ * wakeups. In addition, @p may end up executing on a different CPU
+ * regardless of what happens in the wakeup path making the ttwu_queue
+ * optimization less meaningful. Skip if on SCX.
+ */
+ if (task_on_scx(p))
+ return false;
+
+ /*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
*/
@@ -4029,11 +4127,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s block_task(), as such this must not observe
+ * sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
+ SCHED_WARN_ON(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4322,9 +4425,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */
+ SCHED_WARN_ON(p->se.sched_delayed);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
@@ -4342,6 +4447,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ init_scx_entity(&p->scx);
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
@@ -4572,6 +4681,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
@@ -4582,10 +4693,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (dl_prio(p->prio))
return -EAGAIN;
- else if (rt_prio(p->prio))
+
+ scx_pre_fork(p);
+
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
- else
+#ifdef CONFIG_SCHED_CLASS_EXT
+ } else if (task_should_scx(p)) {
+ p->sched_class = &ext_sched_class;
+#endif
+ } else {
p->sched_class = &fair_sched_class;
+ }
init_entity_runnable_average(&p->se);
@@ -4605,7 +4724,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
@@ -4632,11 +4751,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return scx_fork(p);
+}
+
+void sched_cancel_fork(struct task_struct *p)
+{
+ scx_cancel_fork(p);
}
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
+ scx_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
@@ -4686,7 +4813,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(p);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
+ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -5469,6 +5596,7 @@ void sched_tick(void)
calc_global_load_tick(rq);
sched_core_tick(rq);
task_tick_mm_cid(rq, curr);
+ scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5481,8 +5609,10 @@ void sched_tick(void)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- sched_balance_trigger(rq);
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ sched_balance_trigger(rq);
+ }
#endif
}
@@ -5769,11 +5899,22 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
+static void prev_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
+ const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ /*
+ * SCX requires a balance() call before every pick_next_task() including
+ * when waking up from SCHED_IDLE. If @start_class is below SCX, start
+ * from SCX instead.
+ */
+ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+#endif
+
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5782,13 +5923,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
+ for_active_class_range(class, start_class, &idle_sched_class) {
+ if (class->balance && class->balance(rq, prev, rf))
break;
}
-#endif
-
- put_prev_task(rq, prev);
}
/*
@@ -5800,6 +5938,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
const struct sched_class *class;
struct task_struct *p;
+ rq->dl_server = NULL;
+
+ if (scx_enabled())
+ goto restart;
+
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
@@ -5815,35 +5958,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- put_prev_task(rq, prev);
- p = pick_next_task_idle(rq);
+ p = pick_task_idle(rq);
+ put_prev_set_next_task(rq, prev, p);
}
- /*
- * This is the fast path; it cannot be a DL server pick;
- * therefore even if @p == @prev, ->dl_server must be NULL.
- */
- if (p->dl_server)
- p->dl_server = NULL;
-
return p;
}
restart:
- put_prev_task_balance(rq, prev, rf);
-
- /*
- * We've updated @prev and no longer need the server link, clear it.
- * Must be done before ->pick_next_task() because that can (re)set
- * ->dl_server.
- */
- if (prev->dl_server)
- prev->dl_server = NULL;
+ prev_balance(rq, prev, rf);
- for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
- return p;
+ for_each_active_class(class) {
+ if (class->pick_next_task) {
+ p = class->pick_next_task(rq, prev);
+ if (p)
+ return p;
+ } else {
+ p = class->pick_task(rq);
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
+ }
+ }
}
BUG(); /* The idle class should always have a runnable task. */
@@ -5873,7 +6009,9 @@ static inline struct task_struct *pick_task(struct rq *rq)
const struct sched_class *class;
struct task_struct *p;
- for_each_class(class) {
+ rq->dl_server = NULL;
+
+ for_each_active_class(class) {
p = class->pick_task(rq);
if (p)
return p;
@@ -5911,6 +6049,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* another cpu during offline.
*/
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
return __pick_next_task(rq, prev, rf);
}
@@ -5929,16 +6068,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick;
- if (next != prev) {
- put_prev_task(rq, prev);
- set_next_task(rq, next);
- }
-
+ rq->dl_server = rq->core_dl_server;
rq->core_pick = NULL;
- goto out;
+ rq->core_dl_server = NULL;
+ goto out_set_next;
}
- put_prev_task_balance(rq, prev, rf);
+ prev_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
@@ -5979,6 +6115,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
/*
* For robustness, update the min_vruntime_fi for
* unconstrained picks as well.
@@ -6006,7 +6143,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- p = rq_i->core_pick = pick_task(rq_i);
+ rq_i->core_pick = p = pick_task(rq_i);
+ rq_i->core_dl_server = rq_i->dl_server;
+
if (!max || prio_less(max, p, fi_before))
max = p;
}
@@ -6030,6 +6169,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
rq_i->core_pick = p;
+ rq_i->core_dl_server = NULL;
if (p == rq_i->idle) {
if (rq_i->nr_running) {
@@ -6090,6 +6230,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i == cpu) {
rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
continue;
}
@@ -6098,6 +6239,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (rq_i->curr == rq_i->core_pick) {
rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
continue;
}
@@ -6105,8 +6247,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
out_set_next:
- set_next_task(rq, next);
-out:
+ put_prev_set_next_task(rq, prev, next);
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);
@@ -6342,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* Constants for the sched_mode argument of __schedule().
*
* The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
*/
-#define SM_NONE 0x0
-#define SM_PREEMPT 0x1
-#define SM_RTLOCK_WAIT 0x2
-
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT (~0U)
-#else
-# define SM_MASK_PREEMPT SM_PREEMPT
-#endif
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
/*
* __schedule() is the main scheduler function.
@@ -6395,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
{
struct task_struct *prev, *next;
+ /*
+ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+ bool preempt = sched_mode > SM_NONE;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6408,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, !!sched_mode);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(!!sched_mode);
+ rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6443,22 +6582,33 @@ static void __sched notrace __schedule(unsigned int sched_mode)
switch_count = &prev->nivcsw;
+ /* Task state changes only considers SM_PREEMPT as preemption */
+ preempt = sched_mode == SM_PREEMPT;
+
/*
* We must load prev->state once (task_struct::state is volatile), such
* that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+ if (sched_mode == SM_IDLE) {
+ /* SCX must consult the BPF scheduler to tell if rq is empty */
+ if (!rq->nr_running && !scx_enabled()) {
+ next = prev;
+ goto picked;
+ }
+ } else if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
+ int flags = DEQUEUE_NOCLOCK;
+
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev_state & TASK_FROZEN);
- if (prev->sched_contributes_to_load)
- rq->nr_uninterruptible++;
+ if (unlikely(is_special_task_state(prev_state)))
+ flags |= DEQUEUE_SPECIAL;
/*
* __schedule() ttwu()
@@ -6471,17 +6621,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* After this, schedule() must not care about p->state any more.
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
- }
+ block_task(rq, prev, flags);
}
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
+picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
@@ -6523,7 +6669,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+ trace_sched_switch(preempt, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -6599,7 +6745,7 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
{
do {
preempt_disable();
@@ -6644,7 +6790,7 @@ void __sched schedule_idle(void)
*/
WARN_ON_ONCE(current->__state);
do {
- __schedule(SM_NONE);
+ __schedule(SM_IDLE);
} while (need_resched());
}
@@ -6870,6 +7016,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ else if (task_should_scx(p))
+ p->sched_class = &ext_sched_class;
+#endif
else
p->sched_class = &fair_sched_class;
@@ -7015,6 +7165,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
}
__setscheduler_prio(p, prio);
+ check_class_changing(rq, p, prev_class);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -7405,7 +7556,7 @@ EXPORT_SYMBOL(io_schedule);
void sched_show_task(struct task_struct *p)
{
- unsigned long free = 0;
+ unsigned long free;
int ppid;
if (!try_get_task_stack(p))
@@ -7415,9 +7566,7 @@ void sched_show_task(struct task_struct *p)
if (task_is_running(p))
pr_cont(" running task ");
-#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
-#endif
ppid = 0;
rcu_read_lock();
if (pid_alive(p))
@@ -7429,6 +7578,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
+ print_scx_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -7957,6 +8107,8 @@ int sched_cpu_activate(unsigned int cpu)
cpuset_cpu_active();
}
+ scx_rq_activate(rq);
+
/*
* Put the rq online, if not already. This happens:
*
@@ -8006,6 +8158,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_set_rq_offline(rq, cpu);
+ scx_rq_deactivate(rq);
+
/*
* When going down, decrement the number of cores with SMT present.
*/
@@ -8190,11 +8344,15 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
- &fair_sched_class != &rt_sched_class + 1 ||
- &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class != &stop_sched_class + 1);
+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
+#endif
+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
+#ifdef CONFIG_SCHED_CLASS_EXT
+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
#endif
wait_bit_init();
@@ -8218,6 +8376,9 @@ void __init sched_init(void)
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -8228,8 +8389,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
@@ -8284,8 +8443,13 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * This is required for init cpu because rt.c:__enable_runtime()
+ * starts working after scheduler_running, which is not the case
+ * yet.
+ */
+ rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
@@ -8317,10 +8481,12 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+ fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle_count = 0;
@@ -8333,6 +8499,7 @@ void __init sched_init(void)
}
set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice,
/*
* The boot idle thread does lazy MMU switching as well:
@@ -8363,6 +8530,7 @@ void __init sched_init(void)
balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
+ init_sched_ext_class();
psi_init();
@@ -8548,7 +8716,7 @@ void normalize_rt_tasks(void)
schedstat_set(p->stats.sleep_start, 0);
schedstat_set(p->stats.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
+ if (!rt_or_dl_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
@@ -8648,6 +8816,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -8775,6 +8944,7 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
+ scx_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -8789,11 +8959,6 @@ void sched_move_task(struct task_struct *tsk)
}
}
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct task_group, css) : NULL;
-}
-
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -8817,6 +8982,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
+ int ret;
+
+ ret = scx_tg_online(tg);
+ if (ret)
+ return ret;
if (parent)
sched_online_group(tg, parent);
@@ -8831,6 +9001,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ scx_tg_offline(tg);
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -8848,9 +9025,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
-#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
+#ifdef CONFIG_RT_GROUP_SCHED
struct task_struct *task;
struct cgroup_subsys_state *css;
@@ -8858,9 +9035,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
}
- return 0;
-}
#endif
+ return scx_cgroup_can_attach(tset);
+}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -8869,6 +9046,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
+
+ scx_cgroup_finish_attach();
+}
+
+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ scx_cgroup_cancel_attach(tset);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -9045,22 +9229,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
}
#endif /* CONFIG_UCLAMP_TASK_GROUP */
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
+{
#ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
+#else
+ return sched_weight_from_cgroup(tg->scx_weight);
+#endif
+}
+
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ int ret;
+
if (shareval > scale_load_down(ULONG_MAX))
shareval = MAX_SHARES;
- return sched_group_set_shares(css_tg(css), scale_load(shareval));
+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(shareval));
+ return ret;
}
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
-
- return (u64) scale_load_down(tg->shares);
+ return tg_weight(css_tg(css));
}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
@@ -9406,7 +9604,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9434,7 +9631,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -9444,12 +9641,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 idle)
{
- return sched_group_set_idle(css_tg(css), idle);
+ int ret;
+
+ ret = sched_group_set_idle(css_tg(css), idle);
+ if (!ret)
+ scx_group_set_idle(css_tg(css), idle);
+ return ret;
}
#endif
static struct cftype cpu_legacy_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "shares",
.read_u64 = cpu_shares_read_u64,
@@ -9559,38 +9761,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
return 0;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
- u64 weight = scale_load_down(tg->shares);
-
- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+ return sched_weight_to_cgroup(tg_weight(css_tg(css)));
}
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 weight)
+ struct cftype *cft, u64 cgrp_weight)
{
- /*
- * cgroup weight knobs should use the common MIN, DFL and MAX
- * values which are 1, 100 and 10000 respectively. While it loses
- * a bit of range on both ends, it maps pretty well onto the shares
- * value used by scheduler and the round-trip conversions preserve
- * the original value over the entire range.
- */
- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ unsigned long weight;
+ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;
- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ weight = sched_weight_from_cgroup(cgrp_weight);
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css), cgrp_weight);
+ return ret;
}
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- unsigned long weight = scale_load_down(css_tg(css)->shares);
+ unsigned long weight = tg_weight(css_tg(css));
int last_delta = INT_MAX;
int prio, delta;
@@ -9609,7 +9808,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
- int idx;
+ int idx, ret;
if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
@@ -9618,9 +9817,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
idx = array_index_nospec(idx, 40);
weight = sched_prio_to_weight[idx];
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(weight));
+ return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
long period, long quota)
@@ -9680,7 +9883,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
#endif
static struct cftype cpu_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -9734,14 +9937,14 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
.css_local_stat_show = cpu_local_stat_show,
-#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
-#endif
.attach = cpu_cgroup_attach,
+ .cancel_attach = cpu_cgroup_cancel_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
.early_init = true,
@@ -10331,3 +10534,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1;
}
#endif
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ *ctx = (struct sched_enq_and_set_ctx){
+ .p = p,
+ .queue_flags = queue_flags,
+ .queued = task_on_rq_queued(p),
+ .running = task_current(rq, p),
+ };
+
+ update_rq_clock(rq);
+ if (ctx->queued)
+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ if (ctx->running)
+ put_prev_task(rq, p);
+}
+
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(ctx->p);
+
+ lockdep_assert_rq_held(rq);
+
+ if (ctx->queued)
+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ if (ctx->running)
+ set_next_task(rq, ctx->p);
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index eece6244f9d2..c6ba15388ea7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
+ if (!scx_switched_all())
+ util += cpu_util_cfs_boost(sg_cpu->cpu);
util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
util = max(util, boost);
sg_cpu->bw_min = min;
@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
}
#ifdef CONFIG_NO_HZ_COMMON
-static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
- bool ret = idle_calls == sg_cpu->saved_idle_calls;
+ unsigned long idle_calls;
+ bool ret;
+
+ /*
+ * The heuristics in this function is for the fair class. For SCX, the
+ * performance target comes directly from the BPF scheduler. Let's just
+ * follow it.
+ */
+ if (scx_switched_all())
+ return false;
+
+ /* if capped by uclamp_max, always update to be in compliance */
+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
+ return false;
+
+ /*
+ * Maintain the frequency if the CPU has not been idle recently, as
+ * reduction is likely to be premature.
+ */
+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+ ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
#else
-static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
/*
@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
return;
next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
- /*
- * Do not reduce the frequency if the CPU has not been idle
- * recently, as the reduction is likely to be premature then.
- *
- * Except when the rq is capped by uclamp_max.
- */
- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+
+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
!sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- /*
- * Do not reduce the target performance level if the CPU has not been
- * idle recently, as the reduction is likely to be premature then.
- *
- * Except when the rq is capped by uclamp_max.
- */
- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
@@ -654,9 +662,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
- .sched_runtime = 1000000,
- .sched_deadline = 10000000,
- .sched_period = 10000000,
+ .sched_runtime = NSEC_PER_MSEC,
+ .sched_deadline = 10 * NSEC_PER_MSEC,
+ .sched_period = 10 * NSEC_PER_MSEC,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f59e5c19d944..9ce93d0bf452 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
__sub_running_bw(dl_se->dl_bw, dl_rq);
}
-static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw)
{
- struct rq *rq;
-
- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
-
- if (task_on_rq_queued(p))
- return;
+ if (dl_se->dl_non_contending) {
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;
- rq = task_rq(p);
- if (p->dl.dl_non_contending) {
- sub_running_bw(&p->dl, &rq->dl);
- p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
* timer cannot be canceled, inactive_task_timer()
@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
}
- __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __sub_rq_bw(dl_se->dl_bw, &rq->dl);
__add_rq_bw(new_bw, &rq->dl);
}
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
+
+ if (task_on_rq_queued(p))
+ return;
+
+ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw);
+}
+
static void __dl_clear_params(struct sched_dl_entity *dl_se);
/*
@@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
/* for non-boosted task, pi_of(dl_se) == dl_se */
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_of(dl_se)->dl_runtime;
+
+ /*
+ * If it is a deferred reservation, and the server
+ * is not handling an starvation case, defer it.
+ */
+ if (dl_se->dl_defer & !dl_se->dl_defer_running) {
+ dl_se->dl_throttled = 1;
+ dl_se->dl_defer_armed = 1;
+ }
}
/*
@@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
replenish_dl_new_period(dl_se, rq);
}
+static int start_dl_timer(struct sched_dl_entity *dl_se);
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
+
/*
* Pure Earliest Deadline First (EDF) scheduling does not deal with the
* possibility of a entity lasting more than what it declared, and thus
@@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
+ *
+ * Or, it could be the case of a deferred reservation that
+ * was not able to consume its runtime in background and
+ * reached this point with current u > U.
+ *
+ * In both cases, set a new period.
*/
- if (dl_se->dl_deadline == 0)
- replenish_dl_new_period(dl_se, rq);
+ if (dl_se->dl_deadline == 0 ||
+ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ }
if (dl_se->dl_yielded && dl_se->runtime > 0)
dl_se->runtime = 0;
@@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
dl_se->dl_yielded = 0;
if (dl_se->dl_throttled)
dl_se->dl_throttled = 0;
+
+ /*
+ * If this is the replenishment of a deferred reservation,
+ * clear the flag and return.
+ */
+ if (dl_se->dl_defer_armed) {
+ dl_se->dl_defer_armed = 0;
+ return;
+ }
+
+ /*
+ * A this point, if the deferred server is not armed, and the deadline
+ * is in the future, if it is not running already, throttle the server
+ * and arm the defer timer.
+ */
+ if (dl_se->dl_defer && !dl_se->dl_defer_running &&
+ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
+ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
+
+ /*
+ * Set dl_se->dl_defer_armed and dl_throttled variables to
+ * inform the start_dl_timer() that this is a deferred
+ * activation.
+ */
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ if (!start_dl_timer(dl_se)) {
+ /*
+ * If for whatever reason (delays), a previous timer was
+ * queued but not serviced, cancel it and clean the
+ * deferrable server variables intended for start_dl_timer().
+ */
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
+ }
+ }
}
/*
@@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
}
replenish_dl_new_period(dl_se, rq);
+ } else if (dl_server(dl_se) && dl_se->dl_defer) {
+ /*
+ * The server can still use its previous deadline, so check if
+ * it left the dl_defer_running state.
+ */
+ if (!dl_se->dl_defer_running) {
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ }
}
}
@@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
+ *
+ * The deferred reservation will have its timer set to
+ * (deadline - runtime). At that point, the CBS rule will decide
+ * if the current deadline can be used, or if a replenishment is
+ * required to avoid add too much pressure on the system
+ * (current u > U).
*/
- act = ns_to_ktime(dl_next_period(dl_se));
+ if (dl_se->dl_defer_armed) {
+ WARN_ON_ONCE(!dl_se->dl_throttled);
+ act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+ } else {
+ /* act = deadline - rel-deadline + period */
+ act = ns_to_ktime(dl_next_period(dl_se));
+ }
+
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
#endif
}
+/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
+static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+
+static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = rq_of_dl_se(dl_se);
+ u64 fw;
+
+ scoped_guard (rq_lock, rq) {
+ struct rq_flags *rf = &scope.rf;
+
+ if (!dl_se->dl_throttled || !dl_se->dl_runtime)
+ return HRTIMER_NORESTART;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (!dl_se->dl_runtime)
+ return HRTIMER_NORESTART;
+
+ if (!dl_se->server_has_tasks(dl_se)) {
+ replenish_dl_entity(dl_se);
+ return HRTIMER_NORESTART;
+ }
+
+ if (dl_se->dl_defer_armed) {
+ /*
+ * First check if the server could consume runtime in background.
+ * If so, it is possible to push the defer timer for this amount
+ * of time. The dl_server_min_res serves as a limit to avoid
+ * forwarding the timer for a too small amount of time.
+ */
+ if (dl_time_before(rq_clock(dl_se->rq),
+ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) {
+
+ /* reset the defer timer */
+ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime;
+
+ hrtimer_forward_now(timer, ns_to_ktime(fw));
+ return HRTIMER_RESTART;
+ }
+
+ dl_se->dl_defer_running = 1;
+ }
+
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl))
+ resched_curr(rq);
+
+ __push_dl_task(rq, rf);
+ }
+
+ return HRTIMER_NORESTART;
+}
+
/*
* This is the bandwidth enforcement timer callback. If here, we know
* a task is not on its dl_rq, since the fact that the timer was running
@@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct rq_flags rf;
struct rq *rq;
- if (dl_server(dl_se)) {
- struct rq *rq = rq_of_dl_se(dl_se);
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- if (dl_se->dl_throttled) {
- sched_clock_tick();
- update_rq_clock(rq);
-
- if (dl_se->server_has_tasks(dl_se)) {
- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
- resched_curr(rq);
- __push_dl_task(rq, &rf);
- } else {
- replenish_dl_entity(dl_se);
- }
-
- }
- rq_unlock(rq, &rf);
-
- return HRTIMER_NORESTART;
- }
+ if (dl_server(dl_se))
+ return dl_server_timer(timer, dl_se);
p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);
@@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
return (delta * u_act) >> BW_SHIFT;
}
-static inline void
-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
- int flags);
-static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
+s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
s64 scaled_delta_exec;
- if (unlikely(delta_exec <= 0)) {
- if (unlikely(dl_se->dl_yielded))
- goto throttle;
- return;
- }
-
- if (dl_entity_is_special(dl_se))
- return;
-
/*
* For tasks that participate in GRUB, we implement GRUB-PA: the
* spare reclaimed bandwidth is used to clock down frequency.
@@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
}
+ return scaled_delta_exec;
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ s64 scaled_delta_exec;
+
+ if (unlikely(delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
+ return;
+ }
+
+ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer)
+ return;
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+
dl_se->runtime -= scaled_delta_exec;
+ /*
+ * The fair server can consume its runtime while throttled (not queued/
+ * running as regular CFS).
+ *
+ * If the server consumes its entire runtime in this state. The server
+ * is not required for the current period. Thus, reset the server by
+ * starting a new period, pushing the activation.
+ */
+ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
+ /*
+ * If the server was previously activated - the starving condition
+ * took place, it this point it went away because the fair scheduler
+ * was able to get runtime in background. So return to the initial
+ * state.
+ */
+ dl_se->dl_defer_running = 0;
+
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+
+ replenish_dl_new_period(dl_se, dl_se->rq);
+
+ /*
+ * Not being able to start the timer seems problematic. If it could not
+ * be started for whatever reason, we need to "unthrottle" the DL server
+ * and queue right away. Otherwise nothing might queue it. That's similar
+ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn.
+ */
+ WARN_ON_ONCE(!start_dl_timer(dl_se));
+
+ return;
+ }
+
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
@@ -1382,6 +1548,14 @@ throttle:
}
/*
+ * The fair server (sole dl_server) does not account for real-time
+ * workload because it is running fair work.
+ */
+ if (dl_se == &rq->fair_server)
+ return;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
* Because -- for now -- we share the rt bandwidth, we need to
* account our runtime there too, otherwise actual rt tasks
* would be able to exceed the shared quota.
@@ -1405,34 +1579,155 @@ throttle:
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
+#endif
+}
+
+/*
+ * In the non-defer mode, the idle time is not accounted, as the
+ * server provides a guarantee.
+ *
+ * If the dl_server is in defer mode, the idle time is also considered
+ * as time available for the fair server, avoiding a penalty for the
+ * rt scheduler that did not consumed that time.
+ */
+void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+{
+ s64 delta_exec, scaled_delta_exec;
+
+ if (!rq->fair_server.dl_defer)
+ return;
+
+ /* no need to discount more */
+ if (rq->fair_server.runtime < 0)
+ return;
+
+ delta_exec = rq_clock_task(rq) - p->se.exec_start;
+ if (delta_exec < 0)
+ return;
+
+ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
+
+ rq->fair_server.runtime -= scaled_delta_exec;
+
+ if (rq->fair_server.runtime < 0) {
+ rq->fair_server.dl_defer_running = 0;
+ rq->fair_server.runtime = 0;
+ }
+
+ p->se.exec_start = rq_clock_task(rq);
}
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
- update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+ /* 0 runtime = fair server disabled */
+ if (dl_se->dl_runtime)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
void dl_server_start(struct sched_dl_entity *dl_se)
{
+ struct rq *rq = dl_se->rq;
+
+ /*
+ * XXX: the apply do not work fine at the init phase for the
+ * fair server because things are not yet set. We need to improve
+ * this before getting generic.
+ */
if (!dl_server(dl_se)) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
}
+
+ if (!dl_se->dl_runtime)
+ return;
+
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
+ resched_curr(dl_se->rq);
}
void dl_server_stop(struct sched_dl_entity *dl_se)
{
+ if (!dl_se->dl_runtime)
+ return;
+
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
}
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
- dl_server_pick_f pick)
+ dl_server_pick_f pick_task)
{
dl_se->rq = rq;
dl_se->server_has_tasks = has_tasks;
- dl_se->server_pick = pick;
+ dl_se->server_pick_task = pick_task;
+}
+
+void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 new_bw = dl_se->dl_bw;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+
+ dl_b = dl_bw_of(cpu_of(rq));
+ guard(raw_spinlock)(&dl_b->lock);
+
+ if (!dl_bw_cpus(cpu))
+ return;
+
+ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu));
+}
+
+int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
+{
+ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 new_bw = to_ratio(period, runtime);
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ unsigned long cap;
+ int retval = 0;
+ int cpus;
+
+ dl_b = dl_bw_of(cpu);
+ guard(raw_spinlock)(&dl_b->lock);
+
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
+ if (__dl_overflow(dl_b, cap, old_bw, new_bw))
+ return -EBUSY;
+
+ if (init) {
+ __add_rq_bw(new_bw, &rq->dl);
+ __dl_add(dl_b, new_bw, cpus);
+ } else {
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+
+ dl_rq_change_utilization(rq, dl_se, new_bw);
+ }
+
+ dl_se->dl_runtime = runtime;
+ dl_se->dl_deadline = period;
+ dl_se->dl_period = period;
+
+ dl_se->runtime = 0;
+ dl_se->deadline = 0;
+
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+
+ return retval;
}
/*
@@ -1599,46 +1894,40 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
}
-static inline struct sched_statistics *
+static __always_inline struct sched_statistics *
__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
{
+ if (!schedstat_enabled())
+ return NULL;
+
+ if (dl_server(dl_se))
+ return NULL;
+
return &dl_task_of(dl_se)->stats;
}
static inline void
update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
{
- struct sched_statistics *stats;
-
- if (!schedstat_enabled())
- return;
-
- stats = __schedstats_from_dl_se(dl_se);
- __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
}
static inline void
update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
{
- struct sched_statistics *stats;
-
- if (!schedstat_enabled())
- return;
-
- stats = __schedstats_from_dl_se(dl_se);
- __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
}
static inline void
update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
{
- struct sched_statistics *stats;
-
- if (!schedstat_enabled())
- return;
-
- stats = __schedstats_from_dl_se(dl_se);
- __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
}
static inline void
@@ -1735,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* be counted in the active utilization; hence, we need to call
* add_running_bw().
*/
- if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
if (flags & ENQUEUE_WAKEUP)
task_contending(dl_se, flags);
@@ -1757,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
setup_new_dl_entity(dl_se);
}
+ /*
+ * If the reservation is still throttled, e.g., it got replenished but is a
+ * deferred task and still got to wait, don't enqueue.
+ */
+ if (dl_se->dl_throttled && start_dl_timer(dl_se))
+ return;
+
+ /*
+ * We're about to enqueue, make sure we're not ->dl_throttled!
+ * In case the timer was not started, say because the defer time
+ * has passed, mark as not throttled and mark unarmed.
+ * Also cancel earlier timers, since letting those run is pointless.
+ */
+ if (dl_se->dl_throttled) {
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
+
__enqueue_dl_entity(dl_se);
}
@@ -1846,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_pushable_dl_task(rq, p);
}
-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
@@ -1856,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
dequeue_dl_entity(&p->dl, flags);
if (!p->dl.dl_throttled && !dl_server(&p->dl))
dequeue_pushable_dl_task(rq, p);
+
+ return true;
}
/*
@@ -2074,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
deadline_queue_push_tasks(rq);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, &p->dl);
}
static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
@@ -2086,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
return __node_2_dle(left);
}
-static struct task_struct *pick_task_dl(struct rq *rq)
+/*
+ * __pick_next_task_dl - Helper to pick the next -deadline task to run.
+ * @rq: The runqueue to pick the next task from.
+ */
+static struct task_struct *__pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -2100,14 +2417,13 @@ again:
WARN_ON_ONCE(!dl_se);
if (dl_server(dl_se)) {
- p = dl_se->server_pick(dl_se);
+ p = dl_se->server_pick_task(dl_se);
if (!p) {
- WARN_ON_ONCE(1);
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
goto again;
}
- p->dl_server = dl_se;
+ rq->dl_server = dl_se;
} else {
p = dl_task_of(dl_se);
}
@@ -2115,24 +2431,12 @@ again:
return p;
}
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
- struct task_struct *p;
-
- p = pick_task_dl(rq);
- if (!p)
- return p;
-
- if (!p->dl_server)
- set_next_task_dl(rq, p, true);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, &p->dl);
-
- return p;
+ return __pick_task_dl(rq);
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
struct sched_dl_entity *dl_se = &p->dl;
struct dl_rq *dl_rq = &rq->dl;
@@ -2824,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = {
.wakeup_preempt = wakeup_preempt_dl,
- .pick_next_task = pick_next_task_dl,
+ .pick_task = pick_task_dl,
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
.balance = balance_dl,
- .pick_task = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..f4035c7a0fa1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = {
.release = seq_release,
};
+enum dl_param {
+ DL_RUNTIME = 0,
+ DL_PERIOD,
+};
+
+static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+
+static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 runtime, period;
+ size_t err;
+ int retval;
+ u64 value;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &value);
+ if (err)
+ return err;
+
+ scoped_guard (rq_lock_irqsave, rq) {
+ runtime = rq->fair_server.dl_runtime;
+ period = rq->fair_server.dl_period;
+
+ switch (param) {
+ case DL_RUNTIME:
+ if (runtime == value)
+ break;
+ runtime = value;
+ break;
+ case DL_PERIOD:
+ if (value == period)
+ break;
+ period = value;
+ break;
+ }
+
+ if (runtime > period ||
+ period > fair_server_period_max ||
+ period < fair_server_period_min) {
+ return -EINVAL;
+ }
+
+ if (rq->cfs.h_nr_running) {
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
+ }
+
+ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+ if (retval)
+ cnt = retval;
+
+ if (!runtime)
+ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
+ cpu_of(rq));
+
+ if (rq->cfs.h_nr_running)
+ dl_server_start(&rq->fair_server);
+ }
+
+ *ppos += cnt;
+ return cnt;
+}
+
+static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 value;
+
+ switch (param) {
+ case DL_RUNTIME:
+ value = rq->fair_server.dl_runtime;
+ break;
+ case DL_PERIOD:
+ value = rq->fair_server.dl_period;
+ break;
+ }
+
+ seq_printf(m, "%llu\n", value);
+ return 0;
+
+}
+
+static ssize_t
+sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_runtime_fops = {
+ .open = sched_fair_server_runtime_open,
+ .write = sched_fair_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+}
+
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_PERIOD);
+}
+
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_period_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_period_fops = {
+ .open = sched_fair_server_period_open,
+ .write = sched_fair_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static struct dentry *debugfs_sched;
+static void debugfs_fair_server_init(void)
+{
+ struct dentry *d_fair;
+ unsigned long cpu;
+
+ d_fair = debugfs_create_dir("fair_server", debugfs_sched);
+ if (!d_fair)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_fair);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
+ }
+}
+
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -374,6 +531,8 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
+ debugfs_fair_server_init();
+
return 0;
}
late_initcall(sched_init_debug);
@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
SPLIT_NS(p->se.deadline),
+ p->se.custom_slice ? 'S' : ' ',
SPLIT_NS(p->se.slice),
SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
- SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf_task_group_path(m, task_group(p), " %s")
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, " S task PID vruntime eligible "
+ "deadline slice sum-exec switches "
+ "prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+ " node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ " group-path"
+#endif
+ "\n");
SEQ_printf(m, "-------------------------------------------------------"
- "------------------------------------------------------\n");
+ "------------------------------------------------------"
+ "------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+ "--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ "--------------"
+#endif
+ "\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
- SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
root = __pick_root_entity(cfs_rq);
@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
- cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
+
+#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
+#endif
#undef PN
#undef PU
@@ -1090,6 +1264,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.runtime);
P(dl.deadline);
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+ __PS("ext.enabled", task_on_scx(p));
+#endif
#undef PN_SCHEDSTAT
#undef P_SCHEDSTAT
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
index 000000000000..c09e3dc38c34
--- /dev/null
+++ b/kernel/sched/ext.c
@@ -0,0 +1,7191 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <[email protected]>
+ * Copyright (c) 2022 David Vernet <[email protected]>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or
+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
+ * respectively. The codes are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other task to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * Userland can implement an arbitrary scheduling policy by implementing and
+ * loading operations in this table.
+ */
+struct sched_ext_ops {
+ /**
+ * select_cpu - Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
+ * is dispatched, the ops.enqueue() callback will be skipped. Finally,
+ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
+ * local DSQ of whatever CPU is returned by this callback.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * enqueue - Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf
+ * scheduler owns @p and if it fails to dispatch @p, the task will
+ * stall.
+ *
+ * If @p was dispatched from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * dequeue - Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
+ * scx_bpf_consume().
+ *
+ * The maximum number of times scx_bpf_dispatch() can be called without
+ * an intervening scx_bpf_consume() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * tick - Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * runnable - A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * running - A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * stopping - A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * quiescent - A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * yield - Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * core_sched_before - Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * set_weight - Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * set_cpumask - Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * update_idle - Update the idle state of a CPU
+ * @cpu: CPU to udpate the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @rq's CPU goes or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * cpu_acquire - A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * cpu_release - A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * init_task - Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * exit_task - Exit a previously-running task from the system
+ * @p: task to exit
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * enable - Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * disable - Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * dump - Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * dump_cpu - Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * dump_task - Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * cgroup_init - Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * cgroup_exit - Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+ * @cgrp for sched_ext. This operation my block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * cgroup_move - Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * cgroup_cancel_move - Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * cgroup_set_weight - A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @tg's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+#endif /* CONFIG_CGROUPS */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * cpu_online - A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * cpu_offline - A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * init - Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * exit - Clean up after the BPF scheduler
+ * @info: Exit info
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * flags - %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * hotplug_seq - A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * name - BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+};
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, Execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and going to go
+ * through a scheduling cycle before going idle, noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_ops_enable_state {
+ SCX_OPS_PREPPING,
+ SCX_OPS_ENABLING,
+ SCX_OPS_ENABLED,
+ SCX_OPS_DISABLING,
+ SCX_OPS_DISABLED,
+};
+
+static const char *scx_ops_enable_state_str[] = {
+ [SCX_OPS_PREPPING] = "prepping",
+ [SCX_OPS_ENABLING] = "enabling",
+ [SCX_OPS_ENABLED] = "enabled",
+ [SCX_OPS_DISABLING] = "disabling",
+ [SCX_OPS_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+
+/*
+ * During exit, a task may schedule after losing its PIDs. When disabling the
+ * BPF scheduler, we need to be able to iterate tasks in every state to
+ * guarantee system safety. Maintain a dedicated task list which contains every
+ * task between its fork and eventual free.
+ */
+static DEFINE_SPINLOCK(scx_tasks_lock);
+static LIST_HEAD(scx_tasks);
+
+/* ops enable/disable */
+static struct kthread_worker *scx_ops_helper;
+static DEFINE_MUTEX(scx_ops_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
+static struct sched_ext_ops scx_ops;
+static bool scx_warned_zero_slice;
+
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+static struct static_key_false scx_has_op[SCX_OPI_END] =
+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
+
+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
+static struct scx_exit_info *scx_exit_info;
+
+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * A monotically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used by to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
+ */
+static unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
+/* idle tracking */
+#ifdef CONFIG_SMP
+#ifdef CONFIG_CPUMASK_OFFSTACK
+#define CL_ALIGNED_IF_ONSTACK
+#else
+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
+#endif
+
+static struct {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+} idle_masks CL_ALIGNED_IF_ONSTACK;
+
+#endif /* CONFIG_SMP */
+
+/* for %SCX_KICK_WAIT */
+static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+
+/*
+ * Direct dispatch marker.
+ *
+ * Non-NULL values are used for direct dispatch from enqueue path. A valid
+ * pointer points to the task currently being enqueued. An ERR_PTR value is used
+ * to indicate that direct dispatch has already happened.
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
+/* dispatch queues */
+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
+
+static const struct rhashtable_params dsq_hash_params = {
+ .key_len = 8,
+ .key_offset = offsetof(struct scx_dispatch_q, id),
+ .head_offset = offsetof(struct scx_dispatch_q, hash_node),
+};
+
+static struct rhashtable dsq_hash;
+static LLIST_HEAD(dsqs_to_free);
+
+/* dispatch buf */
+struct scx_dsp_buf_ent {
+ struct task_struct *task;
+ unsigned long qseq;
+ u64 dsq_id;
+ u64 enq_flags;
+};
+
+static u32 scx_dsp_max_batch;
+
+struct scx_dsp_ctx {
+ struct rq *rq;
+ u32 cursor;
+ u32 nr_tasks;
+ struct scx_dsp_buf_ent buf[];
+};
+
+static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
+
+/* string formatting from BPF */
+struct scx_bstr_buf {
+ u64 data[MAX_BPRINTF_VARARGS];
+ char line[SCX_EXIT_MSG_LEN];
+};
+
+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
+static struct scx_bstr_buf scx_exit_bstr_buf;
+
+/* ops debug dump */
+struct scx_dump_data {
+ s32 cpu;
+ bool first;
+ s32 cursor;
+ struct seq_buf *s;
+ const char *prefix;
+ struct scx_bstr_buf buf;
+};
+
+static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
+/* /sys/kernel/sched_ext interface */
+static struct kset *scx_kset;
+static struct kobject *scx_root_kobj;
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched_ext.h>
+
+static void process_ddsp_deferred_locals(struct rq *rq);
+static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
+ s64 exit_code,
+ const char *fmt, ...);
+
+#define scx_ops_error_kind(err, fmt, args...) \
+ scx_ops_exit_kind((err), 0, fmt, ##args)
+
+#define scx_ops_exit(code, fmt, args...) \
+ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+
+#define scx_ops_error(fmt, args...) \
+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+
+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+
+static long jiffies_delta_msecs(unsigned long at, unsigned long now)
+{
+ if (time_after(at, now))
+ return jiffies_to_msecs(at - now);
+ else
+ return -(long)jiffies_to_msecs(now - at);
+}
+
+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
+static u32 higher_bits(u32 flags)
+{
+ return ~((1 << fls(flags)) - 1);
+}
+
+/* return the mask with only the highest bit set */
+static u32 highest_bit(u32 flags)
+{
+ int bit = fls(flags);
+ return ((u64)1 << bit) >> 1;
+}
+
+static bool u32_before(u32 a, u32 b)
+{
+ return (s32)(a - b) < 0;
+}
+
+/*
+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
+ * whether it's running from an allowed context.
+ *
+ * @mask is constant, always inline to cull the mask calculations.
+ */
+static __always_inline void scx_kf_allow(u32 mask)
+{
+ /* nesting is allowed only in increasing scx_kf_mask order */
+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
+ current->scx.kf_mask, mask);
+ current->scx.kf_mask |= mask;
+ barrier();
+}
+
+static void scx_kf_disallow(u32 mask)
+{
+ barrier();
+ current->scx.kf_mask &= ~mask;
+}
+
+#define SCX_CALL_OP(mask, op, args...) \
+do { \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ scx_ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ scx_ops.op(args); \
+ } \
+} while (0)
+
+#define SCX_CALL_OP_RET(mask, op, args...) \
+({ \
+ __typeof__(scx_ops.op(args)) __ret; \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ __ret = scx_ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ __ret = scx_ops.op(args); \
+ } \
+ __ret; \
+})
+
+/*
+ * Some kfuncs are allowed only on the tasks that are subjects of the
+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
+ * restrictions, the following SCX_CALL_OP_*() variants should be used when
+ * invoking scx_ops operations that take task arguments. These can only be used
+ * for non-nesting operations due to the way the tasks are tracked.
+ *
+ * kfuncs which can only operate on such tasks can in turn use
+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
+ * the specific task.
+ */
+#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+do { \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ SCX_CALL_OP(mask, op, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+} while (0)
+
+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+({ \
+ __typeof__(scx_ops.op(task, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ __ret; \
+})
+
+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+({ \
+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task0; \
+ current->scx.kf_tasks[1] = task1; \
+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ current->scx.kf_tasks[1] = NULL; \
+ __ret; \
+})
+
+/* @mask is constant, always inline to cull unnecessary branches */
+static __always_inline bool scx_kf_allowed(u32 mask)
+{
+ if (unlikely(!(current->scx.kf_mask & mask))) {
+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
+ return false;
+ }
+
+ /*
+ * Enforce nesting boundaries. e.g. A kfunc which can be called from
+ * DISPATCH must not be called if we're running DEQUEUE which is nested
+ * inside ops.dispatch(). We don't need to check boundaries for any
+ * blocking kfuncs as the verifier ensures they're only called from
+ * sleepable progs.
+ */
+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
+ scx_ops_error("cpu_release kfunc called from a nested operation");
+ return false;
+ }
+
+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
+ scx_ops_error("dispatch kfunc called from a nested operation");
+ return false;
+ }
+
+ return true;
+}
+
+/* see SCX_CALL_OP_TASK() */
+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+ struct task_struct *p)
+{
+ if (!scx_kf_allowed(mask))
+ return false;
+
+ if (unlikely((p != current->scx.kf_tasks[0] &&
+ p != current->scx.kf_tasks[1]))) {
+ scx_ops_error("called on a task not being operated on");
+ return false;
+ }
+
+ return true;
+}
+
+static bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being interated
+ * @cur: current position, %NULL to start iteration
+ * @rev: walk backwards
+ *
+ * Returns %NULL when iteration is finished.
+ */
+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
+ struct task_struct *cur, bool rev)
+{
+ struct list_head *list_node;
+ struct scx_dsq_list_node *dsq_lnode;
+
+ lockdep_assert_held(&dsq->lock);
+
+ if (cur)
+ list_node = &cur->scx.dsq_list.node;
+ else
+ list_node = &dsq->list;
+
+ /* find the next task, need to skip BPF iteration cursors */
+ do {
+ if (rev)
+ list_node = list_node->prev;
+ else
+ list_node = list_node->next;
+
+ if (list_node == &dsq->list)
+ return NULL;
+
+ dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
+ node);
+ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
+
+ return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
+}
+
+#define nldsq_for_each_task(p, dsq) \
+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \
+ (p) = nldsq_next_task((dsq), (p), false))
+
+
+/*
+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
+ * dispatch order. BPF-visible iterator is opaque and larger to allow future
+ * changes without breaking backward compatibility. Can be used with
+ * bpf_for_each(). See bpf_iter_scx_dsq_*().
+ */
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
+ SCX_DSQ_ITER_REV = 1U << 16,
+
+ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
+ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
+
+ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
+ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
+ __SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
+struct bpf_iter_scx_dsq_kern {
+ struct scx_dsq_list_node cursor;
+ struct scx_dispatch_q *dsq;
+ u64 slice;
+ u64 vtime;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
+ u64 __opaque[6];
+} __attribute__((aligned(8)));
+
+
+/*
+ * SCX task iterator.
+ */
+struct scx_task_iter {
+ struct sched_ext_entity cursor;
+ struct task_struct *locked;
+ struct rq *rq;
+ struct rq_flags rf;
+};
+
+/**
+ * scx_task_iter_init - Initialize a task iterator
+ * @iter: iterator to init
+ *
+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
+ * @iter must eventually be exited with scx_task_iter_exit().
+ *
+ * scx_tasks_lock may be released between this and the first next() call or
+ * between any two next() calls. If scx_tasks_lock is released between two
+ * next() calls, the caller is responsible for ensuring that the task being
+ * iterated remains accessible either through RCU read lock or obtaining a
+ * reference count.
+ *
+ * All tasks which existed when the iteration started are guaranteed to be
+ * visited as long as they still exist.
+ */
+static void scx_task_iter_init(struct scx_task_iter *iter)
+{
+ lockdep_assert_held(&scx_tasks_lock);
+
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->locked = NULL;
+}
+
+/**
+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
+ * @iter: iterator to unlock rq for
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited. Unlock the rq if so. This function can be
+ * safely called anytime during an iteration.
+ *
+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
+ * not locking an rq.
+ */
+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+ if (iter->locked) {
+ task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+ iter->locked = NULL;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * scx_task_iter_exit - Exit a task iterator
+ * @iter: iterator to exit
+ *
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
+ * If the iterator holds a task's rq lock, that rq lock is released. See
+ * scx_task_iter_init() for details.
+ */
+static void scx_task_iter_exit(struct scx_task_iter *iter)
+{
+ lockdep_assert_held(&scx_tasks_lock);
+
+ scx_task_iter_rq_unlock(iter);
+ list_del_init(&iter->cursor.tasks_node);
+}
+
+/**
+ * scx_task_iter_next - Next task
+ * @iter: iterator to walk
+ *
+ * Visit the next task. See scx_task_iter_init() for details.
+ */
+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
+{
+ struct list_head *cursor = &iter->cursor.tasks_node;
+ struct sched_ext_entity *pos;
+
+ lockdep_assert_held(&scx_tasks_lock);
+
+ list_for_each_entry(pos, cursor, tasks_node) {
+ if (&pos->tasks_node == &scx_tasks)
+ return NULL;
+ if (!(pos->flags & SCX_TASK_CURSOR)) {
+ list_move(cursor, &pos->tasks_node);
+ return container_of(pos, struct task_struct, scx);
+ }
+ }
+
+ /* can't happen, should always terminate at scx_tasks above */
+ BUG();
+}
+
+/**
+ * scx_task_iter_next_locked - Next non-idle task with its rq locked
+ * @iter: iterator to walk
+ * @include_dead: Whether we should include dead tasks in the iteration
+ *
+ * Visit the non-idle task with its rq lock held. Allows callers to specify
+ * whether they would like to filter out dead tasks. See scx_task_iter_init()
+ * for details.
+ */
+static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{
+ struct task_struct *p;
+
+ scx_task_iter_rq_unlock(iter);
+
+ while ((p = scx_task_iter_next(iter))) {
+ /*
+ * scx_task_iter is used to prepare and move tasks into SCX
+ * while loading the BPF scheduler and vice-versa while
+ * unloading. The init_tasks ("swappers") should be excluded
+ * from the iteration because:
+ *
+ * - It's unsafe to use __setschduler_prio() on an init_task to
+ * determine the sched_class to use as it won't preserve its
+ * idle_sched_class.
+ *
+ * - ops.init/exit_task() can easily be confused if called with
+ * init_tasks as they, e.g., share PID 0.
+ *
+ * As init_tasks are never scheduled through SCX, they can be
+ * skipped safely. Note that is_idle_task() which tests %PF_IDLE
+ * doesn't work here:
+ *
+ * - %PF_IDLE may not be set for an init_task whose CPU hasn't
+ * yet been onlined.
+ *
+ * - %PF_IDLE can be set on tasks that are not init_tasks. See
+ * play_idle_precise() used by CONFIG_IDLE_INJECT.
+ *
+ * Test for idle_sched_class as only init_tasks are on it.
+ */
+ if (p->sched_class != &idle_sched_class)
+ break;
+ }
+ if (!p)
+ return NULL;
+
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked = p;
+
+ return p;
+}
+
+static enum scx_ops_enable_state scx_ops_enable_state(void)
+{
+ return atomic_read(&scx_ops_enable_state_var);
+}
+
+static enum scx_ops_enable_state
+scx_ops_set_enable_state(enum scx_ops_enable_state to)
+{
+ return atomic_xchg(&scx_ops_enable_state_var, to);
+}
+
+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
+ enum scx_ops_enable_state from)
+{
+ int from_v = from;
+
+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+}
+
+static bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
+/**
+ * wait_ops_state - Busy-wait the specified ops state to end
+ * @p: target task
+ * @opss: state to wait the end of
+ *
+ * Busy-wait for @p to transition out of @opss. This can only be used when the
+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
+ * has load_acquire semantics to ensure that the caller can see the updates made
+ * in the enqueueing and dispatching paths.
+ */
+static void wait_ops_state(struct task_struct *p, unsigned long opss)
+{
+ do {
+ cpu_relax();
+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
+}
+
+/**
+ * ops_cpu_valid - Verify a cpu number
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * @cpu is a cpu number which came from the BPF scheduler and can be any value.
+ * Verify that it is in range and one of the possible cpus. If invalid, trigger
+ * an ops error.
+ */
+static bool ops_cpu_valid(s32 cpu, const char *where)
+{
+ if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
+ return true;
+ } else {
+ scx_ops_error("invalid CPU %d%s%s", cpu,
+ where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * ops_sanitize_err - Sanitize a -errno value
+ * @ops_name: operation to blame on failure
+ * @err: -errno value to sanitize
+ *
+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
+ * cause misbehaviors. For an example, a large negative return from
+ * ops.init_task() triggers an oops when passed up the call chain because the
+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
+ * handled as a pointer.
+ */
+static int ops_sanitize_err(const char *ops_name, s32 err)
+{
+ if (err < 0 && err >= -MAX_ERRNO)
+ return err;
+
+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ return -EPROTO;
+}
+
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+}
+
+#ifdef CONFIG_SMP
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+ run_deferred(rq);
+}
+#endif
+
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+ raw_spin_rq_lock(rq);
+ run_deferred(rq);
+ raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Must be called with @rq
+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus
+ * can unlock @rq to e.g. migrate tasks to other rqs.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SMP
+ /*
+ * If in the middle of waking up a task, task_woken_scx() will be called
+ * afterwards which will then run the deferred actions, no need to
+ * schedule anything.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+ return;
+
+ /*
+ * If in balance, the balance callbacks will be called before rq lock is
+ * released. Schedule one.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+ return;
+ }
+#endif
+ /*
+ * No scheduler hooks available. Queue an irq work. They are executed on
+ * IRQ re-enable which may take a bit longer than the scheduler hooks.
+ * The above WAKEUP and BALANCE paths should cover most of the cases and
+ * the time to IRQ re-enable shouldn't be long.
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * touch_core_sched - Update timestamp used for core-sched task ordering
+ * @rq: rq to read clock from, must be locked
+ * @p: task to update the timestamp for
+ *
+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called
+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice
+ * exhaustion).
+ */
+static void touch_core_sched(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * It's okay to update the timestamp spuriously. Use
+ * sched_core_disabled() which is cheaper than enabled().
+ *
+ * As this is used to determine ordering between tasks of sibling CPUs,
+ * it may be better to use per-core dispatch sequence instead.
+ */
+ if (!sched_core_disabled())
+ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
+#endif
+}
+
+/**
+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
+ * @rq: rq to read clock from, must be locked
+ * @p: task being dispatched
+ *
+ * If the BPF scheduler implements custom core-sched ordering via
+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
+ * ordering within each local DSQ. This function is called from dispatch paths
+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
+ */
+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ if (SCX_HAS_OP(core_sched_before))
+ touch_core_sched(rq, p);
+#endif
+}
+
+static void update_curr_scx(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ if (curr->scx.slice != SCX_SLICE_INF) {
+ curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
+ if (!curr->scx.slice)
+ touch_core_sched(rq, curr);
+ }
+}
+
+static bool scx_dsq_priq_less(struct rb_node *node_a,
+ const struct rb_node *node_b)
+{
+ const struct task_struct *a =
+ container_of(node_a, struct task_struct, scx.dsq_priq);
+ const struct task_struct *b =
+ container_of(node_b, struct task_struct, scx.dsq_priq);
+
+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
+}
+
+static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
+{
+ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
+ WRITE_ONCE(dsq->nr, dsq->nr + delta);
+}
+
+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
+ u64 enq_flags)
+{
+ bool is_local = dsq->id == SCX_DSQ_LOCAL;
+
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
+ !RB_EMPTY_NODE(&p->scx.dsq_priq));
+
+ if (!is_local) {
+ raw_spin_lock(&dsq->lock);
+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
+ scx_ops_error("attempting to dispatch to a destroyed dsq");
+ /* fall back to the global dsq */
+ raw_spin_unlock(&dsq->lock);
+ dsq = &scx_dsq_global;
+ raw_spin_lock(&dsq->lock);
+ }
+ }
+
+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
+ (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
+ /*
+ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
+ * their FIFO queues. To avoid confusion and accidentally
+ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
+ * disallow any internal DSQ from doing vtime ordering of
+ * tasks.
+ */
+ scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
+ }
+
+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
+ struct rb_node *rbp;
+
+ /*
+ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
+ * linked to both the rbtree and list on PRIQs, this can only be
+ * tested easily when adding the first task.
+ */
+ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
+ nldsq_next_task(dsq, NULL, false)))
+ scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
+
+ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
+ rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
+
+ /*
+ * Find the previous task and insert after it on the list so
+ * that @dsq->list is vtime ordered.
+ */
+ rbp = rb_prev(&p->scx.dsq_priq);
+ if (rbp) {
+ struct task_struct *prev =
+ container_of(rbp, struct task_struct,
+ scx.dsq_priq);
+ list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ } else {
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ }
+ } else {
+ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */
+ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
+ scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ else
+ list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ }
+
+ /* seq records the order tasks are queued, used by BPF DSQ iterator */
+ dsq->seq++;
+ p->scx.dsq_seq = dsq->seq;
+
+ dsq_mod_nr(dsq, 1);
+ p->scx.dsq = dsq;
+
+ /*
+ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
+ * direct dispatch path, but we clear them here because the direct
+ * dispatch verdict may be overridden on the enqueue path during e.g.
+ * bypass.
+ */
+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
+ p->scx.ddsp_enq_flags = 0;
+
+ /*
+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to
+ * match waiters' load_acquire.
+ */
+ if (enq_flags & SCX_ENQ_CLEAR_OPSS)
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ if (is_local) {
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+ bool preempt = false;
+
+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+ rq->curr->sched_class == &ext_sched_class) {
+ rq->curr->scx.slice = 0;
+ preempt = true;
+ }
+
+ if (preempt || sched_class_above(&ext_sched_class,
+ rq->curr->sched_class))
+ resched_curr(rq);
+ } else {
+ raw_spin_unlock(&dsq->lock);
+ }
+}
+
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
+
+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
+ rb_erase(&p->scx.dsq_priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_priq);
+ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
+ }
+
+ list_del_init(&p->scx.dsq_list.node);
+ dsq_mod_nr(dsq, -1);
+}
+
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
+{
+ struct scx_dispatch_q *dsq = p->scx.dsq;
+ bool is_local = dsq == &rq->scx.local_dsq;
+
+ if (!dsq) {
+ /*
+ * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+ * Unlinking is all that's needed to cancel.
+ */
+ if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+ list_del_init(&p->scx.dsq_list.node);
+
+ /*
+ * When dispatching directly from the BPF scheduler to a local
+ * DSQ, the task isn't associated with any DSQ but
+ * @p->scx.holding_cpu may be set under the protection of
+ * %SCX_OPSS_DISPATCHING.
+ */
+ if (p->scx.holding_cpu >= 0)
+ p->scx.holding_cpu = -1;
+
+ return;
+ }
+
+ if (!is_local)
+ raw_spin_lock(&dsq->lock);
+
+ /*
+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
+ * change underneath us.
+ */
+ if (p->scx.holding_cpu < 0) {
+ /* @p must still be on @dsq, dequeue */
+ task_unlink_from_dsq(p, dsq);
+ } else {
+ /*
+ * We're racing against dispatch_to_local_dsq() which already
+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
+ * holding_cpu which tells dispatch_to_local_dsq() that it lost
+ * the race.
+ */
+ WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+ p->scx.holding_cpu = -1;
+ }
+ p->scx.dsq = NULL;
+
+ if (!is_local)
+ raw_spin_unlock(&dsq->lock);
+}
+
+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+{
+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+}
+
+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
+{
+ lockdep_assert(rcu_read_lock_any_held());
+
+ if (dsq_id == SCX_DSQ_GLOBAL)
+ return &scx_dsq_global;
+ else
+ return find_user_dsq(dsq_id);
+}
+
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+ struct task_struct *p)
+{
+ struct scx_dispatch_q *dsq;
+
+ if (dsq_id == SCX_DSQ_LOCAL)
+ return &rq->scx.local_dsq;
+
+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ return &scx_dsq_global;
+
+ return &cpu_rq(cpu)->scx.local_dsq;
+ }
+
+ dsq = find_non_local_dsq(dsq_id);
+ if (unlikely(!dsq)) {
+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
+ return &scx_dsq_global;
+ }
+
+ return dsq;
+}
+
+static void mark_direct_dispatch(struct task_struct *ddsp_task,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ /*
+ * Mark that dispatch already happened from ops.select_cpu() or
+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
+ * which can never match a valid task pointer.
+ */
+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
+
+ /* @p must match the task on the enqueue path */
+ if (unlikely(p != ddsp_task)) {
+ if (IS_ERR(ddsp_task))
+ scx_ops_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
+ else
+ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
+ return;
+ }
+
+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
+ WARN_ON_ONCE(p->scx.ddsp_enq_flags);
+
+ p->scx.ddsp_dsq_id = dsq_id;
+ p->scx.ddsp_enq_flags = enq_flags;
+}
+
+static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+{
+ struct rq *rq = task_rq(p);
+ struct scx_dispatch_q *dsq =
+ find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+
+ touch_core_sched_dispatch(rq, p);
+
+ p->scx.ddsp_enq_flags |= enq_flags;
+
+ /*
+ * We are in the enqueue path with @rq locked and pinned, and thus can't
+ * double lock a remote rq and enqueue to its local DSQ. For
+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+ * the enqueue so that it's executed when @rq can be unlocked.
+ */
+ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
+ unsigned long opss;
+
+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * As @p was never passed to the BPF side, _release is
+ * not strictly necessary. Still do it for consistency.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ default:
+ WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+ p->comm, p->pid, opss);
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ }
+
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ list_add_tail(&p->scx.dsq_list.node,
+ &rq->scx.ddsp_deferred_locals);
+ schedule_deferred(rq);
+ return;
+ }
+
+ dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static bool scx_rq_online(struct rq *rq)
+{
+ /*
+ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
+ * the online state as seen from the BPF scheduler. cpu_active() test
+ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
+ * stay set until the current scheduling operation is complete even if
+ * we aren't locking @rq.
+ */
+ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
+}
+
+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
+ int sticky_cpu)
+{
+ struct task_struct **ddsp_taskp;
+ unsigned long qseq;
+
+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ /* rq migration */
+ if (sticky_cpu == cpu_of(rq))
+ goto local_norefill;
+
+ /*
+ * If !scx_rq_online(), we already told the BPF scheduler that the CPU
+ * is offline and are just running the hotplug path. Don't bother the
+ * BPF scheduler.
+ */
+ if (!scx_rq_online(rq))
+ goto local;
+
+ if (scx_rq_bypassing(rq))
+ goto global;
+
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /* see %SCX_OPS_ENQ_EXITING */
+ if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ unlikely(p->flags & PF_EXITING))
+ goto local;
+
+ if (!SCX_HAS_OP(enqueue))
+ goto global;
+
+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
+
+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
+
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+
+ *ddsp_taskp = NULL;
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /*
+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
+ * dequeue may be waiting. The store_release matches their load_acquire.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
+ return;
+
+direct:
+ direct_dispatch(p, enq_flags);
+ return;
+
+local:
+ /*
+ * For task-ordering, slice refill must be treated as implying the end
+ * of the current slice. Otherwise, the longer @p stays on the CPU, the
+ * higher priority it becomes from scx_prio_less()'s POV.
+ */
+ touch_core_sched(rq, p);
+ p->scx.slice = SCX_SLICE_DFL;
+local_norefill:
+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ return;
+
+global:
+ touch_core_sched(rq, p); /* see the comment in local: */
+ p->scx.slice = SCX_SLICE_DFL;
+ dispatch_enqueue(&scx_dsq_global, p, enq_flags);
+}
+
+static bool task_runnable(const struct task_struct *p)
+{
+ return !list_empty(&p->scx.runnable_node);
+}
+
+static void set_task_runnable(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+ p->scx.runnable_at = jiffies;
+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+ }
+
+ /*
+ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * appened to the runnable_list.
+ */
+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
+}
+
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
+{
+ list_del_init(&p->scx.runnable_node);
+ if (reset_runnable_at)
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+}
+
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+{
+ int sticky_cpu = p->scx.sticky_cpu;
+
+ if (enq_flags & ENQUEUE_WAKEUP)
+ rq->scx.flags |= SCX_RQ_IN_WAKEUP;
+
+ enq_flags |= rq->scx.extra_enq_flags;
+
+ if (sticky_cpu >= 0)
+ p->scx.sticky_cpu = -1;
+
+ /*
+ * Restoring a running task will be immediately followed by
+ * set_next_task_scx() which expects the task to not be on the BPF
+ * scheduler as tasks can only start running through local DSQs. Force
+ * direct-dispatch into the local DSQ by setting the sticky_cpu.
+ */
+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
+ sticky_cpu = cpu_of(rq);
+
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ WARN_ON_ONCE(!task_runnable(p));
+ goto out;
+ }
+
+ set_task_runnable(rq, p);
+ p->scx.flags |= SCX_TASK_QUEUED;
+ rq->scx.nr_running++;
+ add_nr_running(rq, 1);
+
+ if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+
+ if (enq_flags & SCX_ENQ_WAKEUP)
+ touch_core_sched(rq, p);
+
+ do_enqueue_task(rq, p, enq_flags, sticky_cpu);
+out:
+ rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+}
+
+static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+{
+ unsigned long opss;
+
+ /* dequeue is always temporary, don't reset runnable_at */
+ clr_task_runnable(p, false);
+
+ /* acquire ensures that we see the preceding updates on QUEUED */
+ opss = atomic_long_read_acquire(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * QUEUEING is started and finished while holding @p's rq lock.
+ * As we're holding the rq lock now, we shouldn't see QUEUEING.
+ */
+ BUG();
+ case SCX_OPSS_QUEUED:
+ if (SCX_HAS_OP(dequeue))
+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+
+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_NONE))
+ break;
+ fallthrough;
+ case SCX_OPSS_DISPATCHING:
+ /*
+ * If @p is being dispatched from the BPF scheduler to a DSQ,
+ * wait for the transfer to complete so that @p doesn't get
+ * added to its DSQ after dequeueing is complete.
+ *
+ * As we're waiting on DISPATCHING with the rq locked, the
+ * dispatching side shouldn't try to lock the rq while
+ * DISPATCHING is set. See dispatch_to_local_dsq().
+ *
+ * DISPATCHING shouldn't have qseq set and control can reach
+ * here with NONE @opss from the above QUEUED case block.
+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
+ */
+ wait_ops_state(p, SCX_OPSS_DISPATCHING);
+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ break;
+ }
+}
+
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+{
+ if (!(p->scx.flags & SCX_TASK_QUEUED)) {
+ WARN_ON_ONCE(task_runnable(p));
+ return true;
+ }
+
+ ops_dequeue(p, deq_flags);
+
+ /*
+ * A currently running task which is going off @rq first gets dequeued
+ * and then stops running. As we want running <-> stopping transitions
+ * to be contained within runnable <-> quiescent transitions, trigger
+ * ->stopping() early here instead of in put_prev_task_scx().
+ *
+ * @p may go through multiple stopping <-> running transitions between
+ * here and put_prev_task_scx() if task attribute changes occur while
+ * balance_scx() leaves @rq unlocked. However, they don't contain any
+ * information meaningful to the BPF scheduler and can be suppressed by
+ * skipping the callbacks if the task is !QUEUED.
+ */
+ if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ update_curr_scx(rq);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ }
+
+ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+
+ if (deq_flags & SCX_DEQ_SLEEP)
+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
+ else
+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
+
+ p->scx.flags &= ~SCX_TASK_QUEUED;
+ rq->scx.nr_running--;
+ sub_nr_running(rq, 1);
+
+ dispatch_dequeue(rq, p);
+ return true;
+}
+
+static void yield_task_scx(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (SCX_HAS_OP(yield))
+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ else
+ p->scx.slice = 0;
+}
+
+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
+{
+ struct task_struct *from = rq->curr;
+
+ if (SCX_HAS_OP(yield))
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ else
+ return false;
+}
+
+static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct rq *dst_rq)
+{
+ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
+
+ /* @dsq is locked and @p is on @dst_rq */
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(dst_rq);
+
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_list.node, &dst_dsq->list);
+ else
+ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
+
+ dsq_mod_nr(dst_dsq, 1);
+ p->scx.dsq = dst_dsq;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
+ * @p: task to move
+ * @enq_flags: %SCX_ENQ_*
+ * @src_rq: rq to move the task from, locked on entry, released on return
+ * @dst_rq: rq to move the task into, locked on return
+ *
+ * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ */
+static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct rq *src_rq, struct rq *dst_rq)
+{
+ lockdep_assert_rq_held(src_rq);
+
+ /* the following marks @p MIGRATING which excludes dequeue */
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu_of(dst_rq));
+ p->scx.sticky_cpu = cpu_of(dst_rq);
+
+ raw_spin_rq_unlock(src_rq);
+ raw_spin_rq_lock(dst_rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bit. As we own @rq, we can pass them through
+ * @rq->scx.extra_enq_flags instead.
+ */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
+ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
+ dst_rq->scx.extra_enq_flags = enq_flags;
+ activate_task(dst_rq, p, 0);
+ dst_rq->scx.extra_enq_flags = 0;
+}
+
+/*
+ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
+ * differences:
+ *
+ * - is_cpu_allowed() asks "Can this task run on this CPU?" while
+ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
+ * this CPU?".
+ *
+ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
+ * must be allowed to finish on the CPU that it's currently on regardless of
+ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
+ * BPF scheduler shouldn't attempt to migrate a task which has migration
+ * disabled.
+ *
+ * - The BPF scheduler is bypassed while the rq is offline and we can always say
+ * no to the BPF scheduler initiated migrations while offline.
+ */
+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+ bool trigger_error)
+{
+ int cpu = cpu_of(rq);
+
+ /*
+ * We don't require the BPF scheduler to avoid dispatching to offline
+ * CPUs mostly for convenience but also because CPUs can go offline
+ * between scx_bpf_dispatch() calls and here. Trigger error iff the
+ * picked CPU is outside the allowed mask.
+ */
+ if (!task_allowed_on_cpu(p, cpu)) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
+ cpu_of(rq), p->comm, p->pid);
+ return false;
+ }
+
+ if (unlikely(is_migration_disabled(p)))
+ return false;
+
+ if (!scx_rq_online(rq))
+ return false;
+
+ return true;
+}
+
+/**
+ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
+ * @p: target task
+ * @dsq: locked DSQ @p is currently on
+ * @src_rq: rq @p is currently on, stable with @dsq locked
+ *
+ * Called with @dsq locked but no rq's locked. We want to move @p to a different
+ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
+ * required when transferring into a local DSQ. Even when transferring into a
+ * non-local DSQ, it's better to use the same mechanism to protect against
+ * dequeues and maintain the invariant that @p->scx.dsq can only change while
+ * @src_rq is locked, which e.g. scx_dump_task() depends on.
+ *
+ * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
+ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
+ * this may race with dequeue, which can't drop the rq lock or fail, do a little
+ * dancing from our side.
+ *
+ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
+ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
+ * would be cleared to -1. While other cpus may have updated it to different
+ * values afterwards, as this operation can't be preempted or recurse, the
+ * holding_cpu can never become this CPU again before we're done. Thus, we can
+ * tell whether we lost to dequeue by testing whether the holding_cpu still
+ * points to this CPU. See dispatch_dequeue() for the counterpart.
+ *
+ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
+ * still valid. %false if lost to dequeue.
+ */
+static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
+ struct scx_dispatch_q *dsq,
+ struct rq *src_rq)
+{
+ s32 cpu = raw_smp_processor_id();
+
+ lockdep_assert_held(&dsq->lock);
+
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+ task_unlink_from_dsq(p, dsq);
+ p->scx.holding_cpu = cpu;
+
+ raw_spin_unlock(&dsq->lock);
+ raw_spin_rq_lock(src_rq);
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ return likely(p->scx.holding_cpu == cpu) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p));
+}
+
+static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
+ struct scx_dispatch_q *dsq, struct rq *src_rq)
+{
+ raw_spin_rq_unlock(this_rq);
+
+ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
+ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
+ return true;
+ } else {
+ raw_spin_rq_unlock(src_rq);
+ raw_spin_rq_lock(this_rq);
+ return false;
+ }
+}
+#else /* CONFIG_SMP */
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
+#endif /* CONFIG_SMP */
+
+static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+{
+ struct task_struct *p;
+retry:
+ /*
+ * The caller can't expect to successfully consume a task if the task's
+ * addition to @dsq isn't guaranteed to be visible somehow. Test
+ * @dsq->list without locking and skip if it seems empty.
+ */
+ if (list_empty(&dsq->list))
+ return false;
+
+ raw_spin_lock(&dsq->lock);
+
+ nldsq_for_each_task(p, dsq) {
+ struct rq *task_rq = task_rq(p);
+
+ if (rq == task_rq) {
+ task_unlink_from_dsq(p, dsq);
+ move_local_task_to_local_dsq(p, 0, dsq, rq);
+ raw_spin_unlock(&dsq->lock);
+ return true;
+ }
+
+ if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ return true;
+ goto retry;
+ }
+ }
+
+ raw_spin_unlock(&dsq->lock);
+ return false;
+}
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
+ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
+ * DSQ. This function performs all the synchronization dancing needed because
+ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ */
+static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+ struct task_struct *p, u64 enq_flags)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+
+ /*
+ * We're synchronized against dequeue through DISPATCHING. As @p can't
+ * be dequeued, its task_rq and cpus_allowed are stable too.
+ *
+ * If dispatching to @rq that @p is already on, no lock dancing needed.
+ */
+ if (rq == src_rq && rq == dst_rq) {
+ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+ /*
+ * @p is on a possibly remote @src_rq which we need to lock to move the
+ * task. If dequeue is in progress, it'd be locking @src_rq and waiting
+ * on DISPATCHING, so we can't grab @src_rq lock while holding
+ * DISPATCHING.
+ *
+ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
+ * we're moving from a DSQ and use the same mechanism - mark the task
+ * under transfer with holding_cpu, release DISPATCHING and then follow
+ * the same protocol. See unlink_dsq_and_lock_src_rq().
+ */
+ p->scx.holding_cpu = raw_smp_processor_id();
+
+ /* store_release ensures that dequeue sees the above */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ /* switch to @src_rq lock */
+ if (rq != src_rq) {
+ raw_spin_rq_unlock(rq);
+ raw_spin_rq_lock(src_rq);
+ }
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /*
+ * If @p is staying on the same rq, there's no need to go
+ * through the full deactivate/activate cycle. Optimize by
+ * abbreviating move_remote_task_to_local_dsq().
+ */
+ if (src_rq == dst_rq) {
+ p->scx.holding_cpu = -1;
+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ } else {
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ }
+
+ /* if the destination CPU is idle, wake it up */
+ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
+ resched_curr(dst_rq);
+ }
+
+ /* switch back to @rq lock */
+ if (rq != dst_rq) {
+ raw_spin_rq_unlock(dst_rq);
+ raw_spin_rq_lock(rq);
+ }
+#else /* CONFIG_SMP */
+ BUG(); /* control can not reach here on UP */
+#endif /* CONFIG_SMP */
+}
+
+/**
+ * finish_dispatch - Asynchronously finish dispatching a task
+ * @rq: current rq which is locked
+ * @p: task to finish dispatching
+ * @qseq_at_dispatch: qseq when @p started getting dispatched
+ * @dsq_id: destination DSQ ID
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * Dispatching to local DSQs may need to wait for queueing to complete or
+ * require rq lock dancing. As we don't wanna do either while inside
+ * ops.dispatch() to avoid locking order inversion, we split dispatching into
+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
+ * task and its qseq. Once ops.dispatch() returns, this function is called to
+ * finish up.
+ *
+ * There is no guarantee that @p is still valid for dispatching or even that it
+ * was valid in the first place. Make sure that the task is still owned by the
+ * BPF scheduler and claim the ownership before dispatching.
+ */
+static void finish_dispatch(struct rq *rq, struct task_struct *p,
+ unsigned long qseq_at_dispatch,
+ u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long opss;
+
+ touch_core_sched_dispatch(rq, p);
+retry:
+ /*
+ * No need for _acquire here. @p is accessed only after a successful
+ * try_cmpxchg to DISPATCHING.
+ */
+ opss = atomic_long_read(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_DISPATCHING:
+ case SCX_OPSS_NONE:
+ /* someone else already got to it */
+ return;
+ case SCX_OPSS_QUEUED:
+ /*
+ * If qseq doesn't match, @p has gone through at least one
+ * dispatch/dequeue and re-enqueue cycle between
+ * scx_bpf_dispatch() and here and we have no claim on it.
+ */
+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
+ return;
+
+ /*
+ * While we know @p is accessible, we don't yet have a claim on
+ * it - the BPF scheduler is allowed to dispatch tasks
+ * spuriously and there can be a racing dequeue attempt. Let's
+ * claim @p by atomically transitioning it from QUEUED to
+ * DISPATCHING.
+ */
+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_DISPATCHING)))
+ break;
+ goto retry;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * do_enqueue_task() is in the process of transferring the task
+ * to the BPF scheduler while holding @p's rq lock. As we aren't
+ * holding any kernel or BPF resource that the enqueue path may
+ * depend upon, it's safe to wait.
+ */
+ wait_ops_state(p, opss);
+ goto retry;
+ }
+
+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+
+ if (dsq->id == SCX_DSQ_LOCAL)
+ dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ else
+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static void flush_dispatch_buf(struct rq *rq)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ u32 u;
+
+ for (u = 0; u < dspc->cursor; u++) {
+ struct scx_dsp_buf_ent *ent = &dspc->buf[u];
+
+ finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ ent->enq_flags);
+ }
+
+ dspc->nr_tasks += dspc->cursor;
+ dspc->cursor = 0;
+}
+
+static int balance_one(struct rq *rq, struct task_struct *prev)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ int nr_loops = SCX_DSP_MAX_LOOPS;
+
+ lockdep_assert_rq_held(rq);
+ rq->scx.flags |= SCX_RQ_IN_BALANCE;
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+
+ if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ unlikely(rq->scx.cpu_released)) {
+ /*
+ * If the previous sched_class for the current CPU was not SCX,
+ * notify the BPF scheduler that it again has control of the
+ * core. This callback complements ->cpu_release(), which is
+ * emitted in scx_next_task_picked().
+ */
+ if (SCX_HAS_OP(cpu_acquire))
+ SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+ rq->scx.cpu_released = false;
+ }
+
+ if (prev_on_scx) {
+ update_curr_scx(rq);
+
+ /*
+ * If @prev is runnable & has slice left, it has priority and
+ * fetching more just increases latency for the fetched tasks.
+ * Tell pick_task_scx() to keep running @prev. If the BPF
+ * scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
+ *
+ * See scx_ops_disable_workfn() for the explanation on the
+ * bypassing test.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+ prev->scx.slice && !scx_rq_bypassing(rq)) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ }
+
+ /* if there already are tasks to run, nothing to do */
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
+ if (consume_dispatch_q(rq, &scx_dsq_global))
+ goto has_tasks;
+
+ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ goto no_tasks;
+
+ dspc->rq = rq;
+
+ /*
+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
+ * the local DSQ might still end up empty after a successful
+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+ * produced some tasks, retry. The BPF scheduler may depend on this
+ * looping behavior to simplify its implementation.
+ */
+ do {
+ dspc->nr_tasks = 0;
+
+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ prev_on_scx ? prev : NULL);
+
+ flush_dispatch_buf(rq);
+
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+ if (consume_dispatch_q(rq, &scx_dsq_global))
+ goto has_tasks;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_bpf_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_bpf_kick_cpu(cpu_of(rq), 0);
+ break;
+ }
+ } while (dspc->nr_tasks);
+
+no_tasks:
+ /*
+ * Didn't find another task to run. Keep running @prev unless
+ * %SCX_OPS_ENQ_LAST is in effect.
+ */
+ if ((prev->scx.flags & SCX_TASK_QUEUED) &&
+ (!static_branch_unlikely(&scx_ops_enq_last) ||
+ scx_rq_bypassing(rq))) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return false;
+
+has_tasks:
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return true;
+}
+
+static int balance_scx(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+ int ret;
+
+ rq_unpin_lock(rq, rf);
+
+ ret = balance_one(rq, prev);
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When core-sched is enabled, this ops.balance() call will be followed
+ * by pick_task_scx() on this CPU and the SMT siblings. Balance the
+ * siblings too.
+ */
+ if (sched_core_enabled(rq)) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ int scpu;
+
+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
+ struct rq *srq = cpu_rq(scpu);
+ struct task_struct *sprev = srq->curr;
+
+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
+ update_rq_clock(srq);
+ balance_one(srq, sprev);
+ }
+ }
+#endif
+ rq_repin_lock(rq, rf);
+
+ return ret;
+}
+
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch(). Keep popping from the head instead of using
+ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+ * temporarily.
+ */
+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ struct task_struct, scx.dsq_list.node))) {
+ struct scx_dispatch_q *dsq;
+
+ list_del_init(&p->scx.dsq_list.node);
+
+ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ }
+}
+
+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
+{
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ /*
+ * Core-sched might decide to execute @p before it is
+ * dispatched. Call ops_dequeue() to notify the BPF scheduler.
+ */
+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ dispatch_dequeue(rq, p);
+ }
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+
+ clr_task_runnable(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+
+ /*
+ * For now, let's refresh the load_avgs just when transitioning
+ * in and out of nohz. In the future, we might want to add a
+ * mechanism which calls the following periodically on
+ * tick-stopped CPUs.
+ */
+ update_other_load_avgs(rq);
+ }
+}
+
+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+#ifdef CONFIG_SMP
+ if (class == &stop_sched_class)
+ return SCX_CPU_PREEMPT_STOP;
+#endif
+ if (class == &dl_sched_class)
+ return SCX_CPU_PREEMPT_DL;
+ if (class == &rt_sched_class)
+ return SCX_CPU_PREEMPT_RT;
+ return SCX_CPU_PREEMPT_UNKNOWN;
+}
+
+static void switch_class(struct rq *rq, struct task_struct *next)
+{
+ const struct sched_class *next_class = next->sched_class;
+
+#ifdef CONFIG_SMP
+ /*
+ * Pairs with the smp_load_acquire() issued by a CPU in
+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
+ * resched.
+ */
+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+#endif
+ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ return;
+
+ /*
+ * The callback is conceptually meant to convey that the CPU is no
+ * longer under the control of SCX. Therefore, don't invoke the callback
+ * if the next class is below SCX (in which case the BPF scheduler has
+ * actively decided not to schedule any tasks on the CPU).
+ */
+ if (sched_class_above(&ext_sched_class, next_class))
+ return;
+
+ /*
+ * At this point we know that SCX was preempted by a higher priority
+ * sched_class, so invoke the ->cpu_release() callback if we have not
+ * done so already. We only send the callback once between SCX being
+ * preempted, and it regaining control of the CPU.
+ *
+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+ * next time that balance_scx() is invoked.
+ */
+ if (!rq->scx.cpu_released) {
+ if (SCX_HAS_OP(cpu_release)) {
+ struct scx_cpu_release_args args = {
+ .reason = preempt_reason_from_class(next_class),
+ .task = next,
+ };
+
+ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
+ cpu_release, cpu_of(rq), &args);
+ }
+ rq->scx.cpu_released = true;
+ }
+}
+
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+ struct task_struct *next)
+{
+ update_curr_scx(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ set_task_runnable(rq, p);
+
+ /*
+ * If @p has slice left and is being put, @p is getting
+ * preempted by a higher priority scheduler class or core-sched
+ * forcing a different task. Leave it at the head of the local
+ * DSQ.
+ */
+ if (p->scx.slice && !scx_rq_bypassing(rq)) {
+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ return;
+ }
+
+ /*
+ * If @p is runnable but we're about to enter a lower
+ * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
+ * ops.enqueue() that @p is the only one available for this cpu,
+ * which should trigger an explicit follow-up scheduling event.
+ */
+ if (sched_class_above(&ext_sched_class, next->sched_class)) {
+ WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
+ } else {
+ do_enqueue_task(rq, p, 0, -1);
+ }
+ }
+
+ if (next && next->sched_class != &ext_sched_class)
+ switch_class(rq, next);
+}
+
+static struct task_struct *first_local_task(struct rq *rq)
+{
+ return list_first_entry_or_null(&rq->scx.local_dsq.list,
+ struct task_struct, scx.dsq_list.node);
+}
+
+static struct task_struct *pick_task_scx(struct rq *rq)
+{
+ struct task_struct *prev = rq->curr;
+ struct task_struct *p;
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
+ *
+ * WORKAROUND:
+ *
+ * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
+ * have gone through balance_scx(). Unfortunately, there currently is a
+ * bug where fair could say yes on balance() but no on pick_task(),
+ * which then ends up calling pick_task_scx() without preceding
+ * balance_scx().
+ *
+ * For now, ignore cases where $prev is not on SCX. This isn't great and
+ * can theoretically lead to stalls. However, for switch_all cases, this
+ * happens only while a BPF scheduler is being loaded or unloaded, and,
+ * for partial cases, fair will likely keep triggering this CPU.
+ *
+ * Once fair is fixed, restore WARN_ON_ONCE().
+ */
+ if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
+ prev->sched_class == &ext_sched_class) {
+ p = prev;
+ if (!p->scx.slice)
+ p->scx.slice = SCX_SLICE_DFL;
+ } else {
+ p = first_local_task(rq);
+ if (!p)
+ return NULL;
+
+ if (unlikely(!p->scx.slice)) {
+ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
+ p->comm, p->pid);
+ scx_warned_zero_slice = true;
+ }
+ p->scx.slice = SCX_SLICE_DFL;
+ }
+ }
+
+ return p;
+}
+
+#ifdef CONFIG_SCHED_CORE
+/**
+ * scx_prio_less - Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Core-sched is implemented as an additional scheduling layer on top of the
+ * usual sched_class'es and needs to find out the expected task ordering. For
+ * SCX, core-sched calls this function to interrogate the task ordering.
+ *
+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
+ * to implement the default task ordering. The older the timestamp, the higher
+ * prority the task - the global FIFO ordering matching the default scheduling
+ * behavior.
+ *
+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
+ */
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ /*
+ * The const qualifiers are dropped from task_struct pointers when
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
+ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
+ else
+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+#ifdef CONFIG_SMP
+
+static bool test_and_clear_cpu_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
+ * is eventually cleared.
+ */
+ if (cpumask_intersects(smt, idle_masks.smt))
+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
+ else if (cpumask_test_cpu(cpu, idle_masks.smt))
+ __cpumask_clear_cpu(cpu, idle_masks.smt);
+ }
+#endif
+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
+}
+
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
+{
+ int cpu;
+
+retry:
+ if (sched_smt_active()) {
+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ if (flags & SCX_PICK_IDLE_CORE)
+ return -EBUSY;
+ }
+
+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+
+found:
+ if (test_and_clear_cpu_idle(cpu))
+ return cpu;
+ else
+ goto retry;
+}
+
+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *found)
+{
+ s32 cpu;
+
+ *found = false;
+
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return prev_cpu;
+ }
+
+ /*
+ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
+ * under utilized, wake up @p to the local DSQ of the waker. Checking
+ * only for an empty local DSQ is insufficient as it could give the
+ * wakee an unfair advantage when the system is oversaturated.
+ * Checking only for the presence of idle CPUs is also insufficient as
+ * the local DSQ of the waker could have tasks piled up on it even if
+ * there is an idle core elsewhere on the system.
+ */
+ cpu = smp_processor_id();
+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 &&
+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto cpu_found;
+ }
+
+ if (p->nr_cpus_allowed == 1) {
+ if (test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ } else {
+ return prev_cpu;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
+ test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ }
+
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ if (test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ }
+
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+
+ return prev_cpu;
+
+cpu_found:
+ *found = true;
+ return cpu;
+}
+
+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
+{
+ /*
+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
+ * can be a good migration opportunity with low cache and memory
+ * footprint. Returning a CPU different than @prev_cpu triggers
+ * immediate rq migration. However, for SCX, as the current rq
+ * association doesn't dictate where the task is going to run, this
+ * doesn't fit well. If necessary, we can later add a dedicated method
+ * which can decide to preempt self to force it through the regular
+ * scheduling path.
+ */
+ if (unlikely(wake_flags & WF_EXEC))
+ return prev_cpu;
+
+ if (SCX_HAS_OP(select_cpu)) {
+ s32 cpu;
+ struct task_struct **ddsp_taskp;
+
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, p, prev_cpu, wake_flags);
+ *ddsp_taskp = NULL;
+ if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ return cpu;
+ else
+ return prev_cpu;
+ } else {
+ bool found;
+ s32 cpu;
+
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
+ if (found) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ }
+ return cpu;
+ }
+}
+
+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+ run_deferred(rq);
+}
+
+static void set_cpus_allowed_scx(struct task_struct *p,
+ struct affinity_context *ac)
+{
+ set_cpus_allowed_common(p, ac);
+
+ /*
+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily
+ * differ from the configured one in @p->cpus_mask. Always tell the bpf
+ * scheduler the effective one.
+ *
+ * Fine-grained memory write control is enforced by BPF making the const
+ * designation pointless. Cast it away when calling the operation.
+ */
+ if (SCX_HAS_OP(set_cpumask))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ (struct cpumask *)p->cpus_ptr);
+}
+
+static void reset_idle_masks(void)
+{
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ cpumask_copy(idle_masks.cpu, cpu_online_mask);
+ cpumask_copy(idle_masks.smt, cpu_online_mask);
+}
+
+void __scx_update_idle(struct rq *rq, bool idle)
+{
+ int cpu = cpu_of(rq);
+
+ if (SCX_HAS_OP(update_idle)) {
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ if (!static_branch_unlikely(&scx_builtin_idle_enabled))
+ return;
+ }
+
+ if (idle)
+ cpumask_set_cpu(cpu, idle_masks.cpu);
+ else
+ cpumask_clear_cpu(cpu, idle_masks.cpu);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+
+ if (idle) {
+ /*
+ * idle_masks.smt handling is racy but that's fine as
+ * it's only for optimization and self-correcting.
+ */
+ for_each_cpu(cpu, smt) {
+ if (!cpumask_test_cpu(cpu, idle_masks.cpu))
+ return;
+ }
+ cpumask_or(idle_masks.smt, idle_masks.smt, smt);
+ } else {
+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
+ }
+ }
+#endif
+}
+
+static void handle_hotplug(struct rq *rq, bool online)
+{
+ int cpu = cpu_of(rq);
+
+ atomic_long_inc(&scx_hotplug_seq);
+
+ if (online && SCX_HAS_OP(cpu_online))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+ else if (!online && SCX_HAS_OP(cpu_offline))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ else
+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
+}
+
+void scx_rq_activate(struct rq *rq)
+{
+ handle_hotplug(rq, true);
+}
+
+void scx_rq_deactivate(struct rq *rq)
+{
+ handle_hotplug(rq, false);
+}
+
+static void rq_online_scx(struct rq *rq)
+{
+ rq->scx.flags |= SCX_RQ_ONLINE;
+}
+
+static void rq_offline_scx(struct rq *rq)
+{
+ rq->scx.flags &= ~SCX_RQ_ONLINE;
+}
+
+#else /* CONFIG_SMP */
+
+static bool test_and_clear_cpu_idle(int cpu) { return false; }
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
+static void reset_idle_masks(void) {}
+
+#endif /* CONFIG_SMP */
+
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+ struct task_struct *p;
+ struct rq_flags rf;
+ bool timed_out = false;
+
+ rq_lock_irqsave(rq, &rf);
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ unsigned long last_runnable = p->scx.runnable_at;
+
+ if (unlikely(time_after(jiffies,
+ last_runnable + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid,
+ dur_ms / 1000, dur_ms % 1000);
+ timed_out = true;
+ break;
+ }
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+ return timed_out;
+}
+
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+
+ for_each_online_cpu(cpu) {
+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+ break;
+
+ cond_resched();
+ }
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ scx_watchdog_timeout / 2);
+}
+
+void scx_tick(struct rq *rq)
+{
+ unsigned long last_check;
+
+ if (!scx_enabled())
+ return;
+
+ last_check = READ_ONCE(scx_watchdog_timestamp);
+ if (unlikely(time_after(jiffies,
+ last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
+ }
+
+ update_other_load_avgs(rq);
+}
+
+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+{
+ update_curr_scx(rq);
+
+ /*
+ * While disabling, always resched and refresh core-sched timestamp as
+ * we can't trust the slice management or ops.core_sched_before().
+ */
+ if (scx_rq_bypassing(rq)) {
+ curr->scx.slice = 0;
+ touch_core_sched(rq, curr);
+ } else if (SCX_HAS_OP(tick)) {
+ SCX_CALL_OP(SCX_KF_REST, tick, curr);
+ }
+
+ if (!curr->scx.slice)
+ resched_curr(rq);
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+static struct cgroup *tg_cgrp(struct task_group *tg)
+{
+ /*
+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
+ * root cgroup.
+ */
+ if (tg && tg->css.cgroup)
+ return tg->css.cgroup;
+ else
+ return &cgrp_dfl_root.cgrp;
+}
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg)
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+{
+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
+}
+
+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
+{
+ enum scx_task_state prev_state = scx_get_task_state(p);
+ bool warn = false;
+
+ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
+
+ switch (state) {
+ case SCX_TASK_NONE:
+ break;
+ case SCX_TASK_INIT:
+ warn = prev_state != SCX_TASK_NONE;
+ break;
+ case SCX_TASK_READY:
+ warn = prev_state == SCX_TASK_NONE;
+ break;
+ case SCX_TASK_ENABLED:
+ warn = prev_state != SCX_TASK_READY;
+ break;
+ default:
+ warn = true;
+ return;
+ }
+
+ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+
+ p->scx.flags &= ~SCX_TASK_STATE_MASK;
+ p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
+}
+
+static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+{
+ int ret;
+
+ p->scx.disallow = false;
+
+ if (SCX_HAS_OP(init_task)) {
+ struct scx_init_task_args args = {
+ SCX_INIT_TASK_ARGS_CGROUP(tg)
+ .fork = fork,
+ };
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ if (unlikely(ret)) {
+ ret = ops_sanitize_err("init_task", ret);
+ return ret;
+ }
+ }
+
+ scx_set_task_state(p, SCX_TASK_INIT);
+
+ if (p->scx.disallow) {
+ if (!fork) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * We're in the load path and @p->policy will be applied
+ * right after. Reverting @p->policy here and rejecting
+ * %SCHED_EXT transitions from scx_check_setscheduler()
+ * guarantees that if ops.init_task() sets @p->disallow,
+ * @p can never be in SCX.
+ */
+ if (p->policy == SCHED_EXT) {
+ p->policy = SCHED_NORMAL;
+ atomic_long_inc(&scx_nr_rejected);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+ } else if (p->policy == SCHED_EXT) {
+ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
+ }
+ }
+
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ return 0;
+}
+
+static void scx_ops_enable_task(struct task_struct *p)
+{
+ u32 weight;
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ /*
+ * Set the weight before calling ops.enable() so that the scheduler
+ * doesn't see a stale value if they inspect the task struct.
+ */
+ if (task_has_idle_policy(p))
+ weight = WEIGHT_IDLEPRIO;
+ else
+ weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
+
+ p->scx.weight = sched_weight_to_cgroup(weight);
+
+ if (SCX_HAS_OP(enable))
+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(set_weight))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+static void scx_ops_disable_task(struct task_struct *p)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(disable))
+ SCX_CALL_OP(SCX_KF_REST, disable, p);
+ scx_set_task_state(p, SCX_TASK_READY);
+}
+
+static void scx_ops_exit_task(struct task_struct *p)
+{
+ struct scx_exit_task_args args = {
+ .cancelled = false,
+ };
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ switch (scx_get_task_state(p)) {
+ case SCX_TASK_NONE:
+ return;
+ case SCX_TASK_INIT:
+ args.cancelled = true;
+ break;
+ case SCX_TASK_READY:
+ break;
+ case SCX_TASK_ENABLED:
+ scx_ops_disable_task(p);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return;
+ }
+
+ if (SCX_HAS_OP(exit_task))
+ SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+ scx_set_task_state(p, SCX_TASK_NONE);
+}
+
+void init_scx_entity(struct sched_ext_entity *scx)
+{
+ /*
+ * init_idle() calls this function again after fork sequence is
+ * complete. Don't touch ->tasks_node as it's already linked.
+ */
+ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
+
+ INIT_LIST_HEAD(&scx->dsq_list.node);
+ RB_CLEAR_NODE(&scx->dsq_priq);
+ scx->sticky_cpu = -1;
+ scx->holding_cpu = -1;
+ INIT_LIST_HEAD(&scx->runnable_node);
+ scx->runnable_at = jiffies;
+ scx->ddsp_dsq_id = SCX_DSQ_INVALID;
+ scx->slice = SCX_SLICE_DFL;
+}
+
+void scx_pre_fork(struct task_struct *p)
+{
+ /*
+ * BPF scheduler enable/disable paths want to be able to iterate and
+ * update all tasks which can become complex when racing forks. As
+ * enable/disable are very cold paths, let's use a percpu_rwsem to
+ * exclude forks.
+ */
+ percpu_down_read(&scx_fork_rwsem);
+}
+
+int scx_fork(struct task_struct *p)
+{
+ percpu_rwsem_assert_held(&scx_fork_rwsem);
+
+ if (scx_enabled())
+ return scx_ops_init_task(p, task_group(p), true);
+ else
+ return 0;
+}
+
+void scx_post_fork(struct task_struct *p)
+{
+ if (scx_enabled()) {
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
+ * Enable the task immediately if it's running on sched_ext.
+ * Otherwise, it'll be enabled in switching_to_scx() if and
+ * when it's ever configured to run with a SCHED_EXT policy.
+ */
+ if (p->sched_class == &ext_sched_class) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_ops_enable_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+ }
+
+ spin_lock_irq(&scx_tasks_lock);
+ list_add_tail(&p->scx.tasks_node, &scx_tasks);
+ spin_unlock_irq(&scx_tasks_lock);
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+void scx_cancel_fork(struct task_struct *p)
+{
+ if (scx_enabled()) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
+ scx_ops_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+void sched_ext_free(struct task_struct *p)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&scx_tasks_lock, flags);
+ list_del_init(&p->scx.tasks_node);
+ spin_unlock_irqrestore(&scx_tasks_lock, flags);
+
+ /*
+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
+ * ENABLED transitions can't race us. Disable ops for @p.
+ */
+ if (scx_get_task_state(p) != SCX_TASK_NONE) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_ops_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+}
+
+static void reweight_task_scx(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
+ if (SCX_HAS_OP(set_weight))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+{
+}
+
+static void switching_to_scx(struct rq *rq, struct task_struct *p)
+{
+ scx_ops_enable_task(p);
+
+ /*
+ * set_cpus_allowed_scx() is not called while @p is associated with a
+ * different scheduler class. Keep the BPF scheduler up-to-date.
+ */
+ if (SCX_HAS_OP(set_cpumask))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ (struct cpumask *)p->cpus_ptr);
+}
+
+static void switched_from_scx(struct rq *rq, struct task_struct *p)
+{
+ scx_ops_disable_task(p);
+}
+
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
+
+int scx_check_setscheduler(struct task_struct *p, int policy)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ /* if disallow, reject transitioning into SCX */
+ if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
+ p->policy != policy && policy == SCHED_EXT)
+ return -EACCES;
+
+ return 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_rq_bypassing(rq))
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can dispatch from different DSQs, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+static bool cgroup_warned_missing_weight;
+static bool cgroup_warned_missing_idle;
+
+static void scx_cgroup_warn_missing_weight(struct task_group *tg)
+{
+ if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
+ cgroup_warned_missing_weight)
+ return;
+
+ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
+ return;
+
+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
+ scx_ops.name);
+ cgroup_warned_missing_weight = true;
+}
+
+static void scx_cgroup_warn_missing_idle(struct task_group *tg)
+{
+ if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
+ cgroup_warned_missing_idle)
+ return;
+
+ if (!tg->idle)
+ return;
+
+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
+ scx_ops.name);
+ cgroup_warned_missing_idle = true;
+}
+
+int scx_tg_online(struct task_group *tg)
+{
+ int ret = 0;
+
+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ scx_cgroup_warn_missing_weight(tg);
+
+ if (SCX_HAS_OP(cgroup_init)) {
+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ tg->css.cgroup, &args);
+ if (!ret)
+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ else
+ ret = ops_sanitize_err("cgroup_init", ret);
+ } else {
+ tg->scx_flags |= SCX_TG_ONLINE;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+ return ret;
+}
+
+void scx_tg_offline(struct task_group *tg)
+{
+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+int scx_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+ int ret;
+
+ /* released in scx_finish/cancel_attach() */
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (!scx_enabled())
+ return 0;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ struct cgroup *from = tg_cgrp(task_group(p));
+ struct cgroup *to = tg_cgrp(css_tg(css));
+
+ WARN_ON_ONCE(p->scx.cgrp_moving_from);
+
+ /*
+ * sched_move_task() omits identity migrations. Let's match the
+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
+ * always match one-to-one.
+ */
+ if (from == to)
+ continue;
+
+ if (SCX_HAS_OP(cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ p, from, css->cgroup);
+ if (ret)
+ goto err;
+ }
+
+ p->scx.cgrp_moving_from = from;
+ }
+
+ return 0;
+
+err:
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+ return ops_sanitize_err("cgroup_prep_move", ret);
+}
+
+void scx_move_task(struct task_struct *p)
+{
+ if (!scx_enabled())
+ return;
+
+ /*
+ * We're called from sched_move_task() which handles both cgroup and
+ * autogroup moves. Ignore the latter.
+ *
+ * Also ignore exiting tasks, because in the exit path tasks transition
+ * from the autogroup to the root group, so task_group_is_autogroup()
+ * alone isn't able to catch exiting autogroup tasks. This is safe for
+ * cgroup_move(), because cgroup migrations never happen for PF_EXITING
+ * tasks.
+ */
+ if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
+ return;
+
+ /*
+ * @p must have ops.cgroup_prep_move() called on it and thus
+ * cgrp_moving_from set.
+ */
+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ p->scx.cgrp_moving_from = NULL;
+}
+
+void scx_cgroup_finish_attach(void)
+{
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ if (!scx_enabled())
+ goto out_unlock;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+out_unlock:
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
+{
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (tg->scx_weight != weight) {
+ if (SCX_HAS_OP(cgroup_set_weight))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ tg_cgrp(tg), weight);
+ tg->scx_weight = weight;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+void scx_group_set_idle(struct task_group *tg, bool idle)
+{
+ percpu_down_read(&scx_cgroup_rwsem);
+ scx_cgroup_warn_missing_idle(tg);
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+static void scx_cgroup_lock(void)
+{
+ percpu_down_write(&scx_cgroup_rwsem);
+}
+
+static void scx_cgroup_unlock(void)
+{
+ percpu_up_write(&scx_cgroup_rwsem);
+}
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+static inline void scx_cgroup_lock(void) {}
+static inline void scx_cgroup_unlock(void) {}
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+/*
+ * Omitted operations:
+ *
+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
+ * isn't tied to the CPU at that point. Preemption is implemented by resetting
+ * the victim task's slice to 0 and triggering reschedule on the target CPU.
+ *
+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
+ *
+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
+ * their current sched_class. Call them directly from sched core instead.
+ */
+DEFINE_SCHED_CLASS(ext) = {
+ .enqueue_task = enqueue_task_scx,
+ .dequeue_task = dequeue_task_scx,
+ .yield_task = yield_task_scx,
+ .yield_to_task = yield_to_task_scx,
+
+ .wakeup_preempt = wakeup_preempt_scx,
+
+ .balance = balance_scx,
+ .pick_task = pick_task_scx,
+
+ .put_prev_task = put_prev_task_scx,
+ .set_next_task = set_next_task_scx,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
+ .set_cpus_allowed = set_cpus_allowed_scx,
+
+ .rq_online = rq_online_scx,
+ .rq_offline = rq_offline_scx,
+#endif
+
+ .task_tick = task_tick_scx,
+
+ .switching_to = switching_to_scx,
+ .switched_from = switched_from_scx,
+ .switched_to = switched_to_scx,
+ .reweight_task = reweight_task_scx,
+ .prio_changed = prio_changed_scx,
+
+ .update_curr = update_curr_scx,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
+};
+
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+{
+ memset(dsq, 0, sizeof(*dsq));
+
+ raw_spin_lock_init(&dsq->lock);
+ INIT_LIST_HEAD(&dsq->list);
+ dsq->id = dsq_id;
+}
+
+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
+{
+ struct scx_dispatch_q *dsq;
+ int ret;
+
+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
+ return ERR_PTR(-EINVAL);
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return ERR_PTR(-ENOMEM);
+
+ init_dsq(dsq, dsq_id);
+
+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ if (ret) {
+ kfree(dsq);
+ return ERR_PTR(ret);
+ }
+ return dsq;
+}
+
+static void free_dsq_irq_workfn(struct irq_work *irq_work)
+{
+ struct llist_node *to_free = llist_del_all(&dsqs_to_free);
+ struct scx_dispatch_q *dsq, *tmp_dsq;
+
+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
+ kfree_rcu(dsq, rcu);
+}
+
+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
+
+static void destroy_dsq(u64 dsq_id)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ dsq = find_user_dsq(dsq_id);
+ if (!dsq)
+ goto out_unlock_rcu;
+
+ raw_spin_lock_irqsave(&dsq->lock, flags);
+
+ if (dsq->nr) {
+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
+ goto out_unlock_dsq;
+ }
+
+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ goto out_unlock_dsq;
+
+ /*
+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
+ * queueing more tasks. As this function can be called from anywhere,
+ * freeing is bounced through an irq work to avoid nesting RCU
+ * operations inside scheduler locks.
+ */
+ dsq->id = SCX_DSQ_INVALID;
+ llist_add(&dsq->free_node, &dsqs_to_free);
+ irq_work_queue(&free_dsq_irq_work);
+
+out_unlock_dsq:
+ raw_spin_unlock_irqrestore(&dsq->lock, flags);
+out_unlock_rcu:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+static void scx_cgroup_exit(void)
+{
+ struct cgroup_subsys_state *css;
+
+ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+
+ /*
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * cgroups and exit all the inited ones, all online cgroups are exited.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_post(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+
+ if (!(tg->scx_flags & SCX_TG_INITED))
+ continue;
+ tg->scx_flags &= ~SCX_TG_INITED;
+
+ if (!scx_ops.cgroup_exit)
+ continue;
+
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+}
+
+static int scx_cgroup_init(void)
+{
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+
+ cgroup_warned_missing_weight = false;
+ cgroup_warned_missing_idle = false;
+
+ /*
+ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+ * cgroups and init, all online cgroups are initialized.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+
+ scx_cgroup_warn_missing_weight(tg);
+ scx_cgroup_warn_missing_idle(tg);
+
+ if ((tg->scx_flags &
+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
+ continue;
+
+ if (!scx_ops.cgroup_init) {
+ tg->scx_flags |= SCX_TG_INITED;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ css->cgroup, &args);
+ if (ret) {
+ css_put(css);
+ return ret;
+ }
+ tg->scx_flags |= SCX_TG_INITED;
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+#else
+static void scx_cgroup_exit(void) {}
+static int scx_cgroup_init(void) { return 0; }
+#endif
+
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
+ */
+
+#define SCX_ATTR(_name) \
+ static struct kobj_attribute scx_attr_##_name = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = scx_attr_##_name##_show, \
+ }
+
+static ssize_t scx_attr_state_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ scx_ops_enable_state_str[scx_ops_enable_state()]);
+}
+SCX_ATTR(state);
+
+static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
+}
+SCX_ATTR(switch_all);
+
+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
+}
+SCX_ATTR(nr_rejected);
+
+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
+}
+SCX_ATTR(hotplug_seq);
+
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
+}
+SCX_ATTR(enable_seq);
+
+static struct attribute *scx_global_attrs[] = {
+ &scx_attr_state.attr,
+ &scx_attr_switch_all.attr,
+ &scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
+ &scx_attr_enable_seq.attr,
+ NULL,
+};
+
+static const struct attribute_group scx_global_attr_group = {
+ .attrs = scx_global_attrs,
+};
+
+static void scx_kobj_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static ssize_t scx_attr_ops_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", scx_ops.name);
+}
+SCX_ATTR(ops);
+
+static struct attribute *scx_sched_attrs[] = {
+ &scx_attr_ops.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(scx_sched);
+
+static const struct kobj_type scx_ktype = {
+ .release = scx_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = scx_sched_groups,
+};
+
+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+ return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+}
+
+static const struct kset_uevent_ops scx_uevent_ops = {
+ .uevent = scx_uevent,
+};
+
+/*
+ * Used by sched_fork() and __setscheduler_prio() to pick the matching
+ * sched_class. dl/rt are already handled.
+ */
+bool task_should_scx(struct task_struct *p)
+{
+ if (!scx_enabled() ||
+ unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ return false;
+ if (READ_ONCE(scx_switching_all))
+ return true;
+ return p->policy == SCHED_EXT;
+}
+
+/**
+ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ *
+ * Bypassing guarantees that all runnable tasks make forward progress without
+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
+ * be held by tasks that the BPF scheduler is forgetting to run, which
+ * unfortunately also excludes toggling the static branches.
+ *
+ * Let's work around by overriding a couple ops and modifying behaviors based on
+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
+ * to force global FIFO scheduling.
+ *
+ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * %SCX_OPS_ENQ_LAST is also ignored.
+ *
+ * b. ops.dispatch() is ignored.
+ *
+ * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ * can't be trusted. Whenever a tick triggers, the running task is rotated to
+ * the tail of the queue with core_sched_at touched.
+ *
+ * d. pick_next_task() suppresses zero slice warning.
+ *
+ * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * operations.
+ *
+ * f. scx_prio_less() reverts to the default core_sched_at order.
+ */
+static void scx_ops_bypass(bool bypass)
+{
+ int depth, cpu;
+
+ if (bypass) {
+ depth = atomic_inc_return(&scx_ops_bypass_depth);
+ WARN_ON_ONCE(depth <= 0);
+ if (depth != 1)
+ return;
+ } else {
+ depth = atomic_dec_return(&scx_ops_bypass_depth);
+ WARN_ON_ONCE(depth < 0);
+ if (depth != 0)
+ return;
+ }
+
+ /*
+ * No task property is changing. We just need to make sure all currently
+ * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * state. As an optimization, walk each rq's runnable_list instead of
+ * the scx_tasks list.
+ *
+ * This function can't trust the scheduler and thus can't use
+ * cpus_read_lock(). Walk all possible CPUs instead of online.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p, *n;
+
+ rq_lock_irqsave(rq, &rf);
+
+ if (bypass) {
+ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+ rq->scx.flags |= SCX_RQ_BYPASSING;
+ } else {
+ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
+ rq->scx.flags &= ~SCX_RQ_BYPASSING;
+ }
+
+ /*
+ * We need to guarantee that no tasks are on the BPF scheduler
+ * while bypassing. Either we see enabled or the enable path
+ * sees scx_rq_bypassing() before moving tasks to SCX.
+ */
+ if (!scx_enabled()) {
+ rq_unlock_irqrestore(rq, &rf);
+ continue;
+ }
+
+ /*
+ * The use of list_for_each_entry_safe_reverse() is required
+ * because each task is going to be removed from and added back
+ * to the runnable_list during iteration. Because they're added
+ * to the tail of the list, safe reverse iteration can still
+ * visit all nodes.
+ */
+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
+ scx.runnable_node) {
+ struct sched_enq_and_set_ctx ctx;
+
+ /* cycling deq/enq is enough, see the function comment */
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ sched_enq_and_set_task(&ctx);
+ }
+
+ rq_unlock_irqrestore(rq, &rf);
+
+ /* kick to restore ticks */
+ resched_cpu(cpu);
+ }
+}
+
+static void free_exit_info(struct scx_exit_info *ei)
+{
+ kfree(ei->dump);
+ kfree(ei->msg);
+ kfree(ei->bt);
+ kfree(ei);
+}
+
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
+{
+ struct scx_exit_info *ei;
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
+ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+
+ if (!ei->bt || !ei->msg || !ei->dump) {
+ free_exit_info(ei);
+ return NULL;
+ }
+
+ return ei;
+}
+
+static const char *scx_exit_reason(enum scx_exit_kind kind)
+{
+ switch (kind) {
+ case SCX_EXIT_UNREG:
+ return "unregistered from user space";
+ case SCX_EXIT_UNREG_BPF:
+ return "unregistered from BPF";
+ case SCX_EXIT_UNREG_KERN:
+ return "unregistered from the main kernel";
+ case SCX_EXIT_SYSRQ:
+ return "disabled by sysrq-S";
+ case SCX_EXIT_ERROR:
+ return "runtime error";
+ case SCX_EXIT_ERROR_BPF:
+ return "scx_bpf_error";
+ case SCX_EXIT_ERROR_STALL:
+ return "runnable task stall";
+ default:
+ return "<UNKNOWN>";
+ }
+}
+
+static void scx_ops_disable_workfn(struct kthread_work *work)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int i, kind;
+
+ kind = atomic_read(&scx_exit_kind);
+ while (true) {
+ /*
+ * NONE indicates that a new scx_ops has been registered since
+ * disable was scheduled - don't kill the new ops. DONE
+ * indicates that the ops has already been disabled.
+ */
+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ return;
+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ break;
+ }
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ /* guarantee forward progress by bypassing scx_ops */
+ scx_ops_bypass(true);
+
+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
+ case SCX_OPS_DISABLING:
+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
+ break;
+ case SCX_OPS_DISABLED:
+ pr_warn("sched_ext: ops error detected without ops (%s)\n",
+ scx_exit_info->msg);
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
+ SCX_OPS_DISABLING);
+ goto done;
+ default:
+ break;
+ }
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_ops_enable_mutex);
+
+ static_branch_disable(&__scx_switched_all);
+ WRITE_ONCE(scx_switching_all, false);
+
+ /*
+ * Avoid racing against fork and cgroup changes. See scx_ops_enable()
+ * for explanation on the locking order.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+ cpus_read_lock();
+ scx_cgroup_lock();
+
+ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
+ /*
+ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
+ * must be switched out and exited synchronously.
+ */
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
+ __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class);
+
+ sched_enq_and_set_task(&ctx);
+
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ scx_ops_exit_task(p);
+ }
+ scx_task_iter_exit(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+
+ /* no task is on scx, turn off all the switches and flush in-progress calls */
+ static_branch_disable_cpuslocked(&__scx_ops_enabled);
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ static_branch_disable_cpuslocked(&scx_has_op[i]);
+ static_branch_disable_cpuslocked(&scx_ops_enq_last);
+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
+ synchronize_rcu();
+
+ scx_cgroup_exit();
+
+ scx_cgroup_unlock();
+ cpus_read_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ if (ei->kind >= SCX_EXIT_ERROR) {
+ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ scx_ops.name, ei->reason);
+
+ if (ei->msg[0] != '\0')
+ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
+ } else {
+ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ scx_ops.name, ei->reason);
+ }
+
+ if (scx_ops.exit)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+
+ cancel_delayed_work_sync(&scx_watchdog_work);
+
+ /*
+ * Delete the kobject from the hierarchy eagerly in addition to just
+ * dropping a reference. Otherwise, if the object is deleted
+ * asynchronously, sysfs could observe an object of the same name still
+ * in the hierarchy when another scheduler is loaded.
+ */
+ kobject_del(scx_root_kobj);
+ kobject_put(scx_root_kobj);
+ scx_root_kobj = NULL;
+
+ memset(&scx_ops, 0, sizeof(scx_ops));
+
+ rhashtable_walk_enter(&dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ free_percpu(scx_dsp_ctx);
+ scx_dsp_ctx = NULL;
+ scx_dsp_max_batch = 0;
+
+ free_exit_info(scx_exit_info);
+ scx_exit_info = NULL;
+
+ mutex_unlock(&scx_ops_enable_mutex);
+
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
+ SCX_OPS_DISABLING);
+done:
+ scx_ops_bypass(false);
+}
+
+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
+
+static void schedule_scx_ops_disable_work(void)
+{
+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
+
+ /*
+ * We may be called spuriously before the first bpf_sched_ext_reg(). If
+ * scx_ops_helper isn't set up yet, there's nothing to do.
+ */
+ if (helper)
+ kthread_queue_work(helper, &scx_ops_disable_work);
+}
+
+static void scx_ops_disable(enum scx_exit_kind kind)
+{
+ int none = SCX_EXIT_NONE;
+
+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
+ kind = SCX_EXIT_ERROR;
+
+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
+
+ schedule_scx_ops_disable_work();
+}
+
+static void dump_newline(struct seq_buf *s)
+{
+ trace_sched_ext_dump("");
+
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size)
+ seq_buf_putc(s, '\n');
+}
+
+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
+{
+ va_list args;
+
+#ifdef CONFIG_TRACEPOINTS
+ if (trace_sched_ext_dump_enabled()) {
+ /* protected by scx_dump_state()::dump_lock */
+ static char line_buf[SCX_EXIT_MSG_LEN];
+
+ va_start(args, fmt);
+ vscnprintf(line_buf, sizeof(line_buf), fmt, args);
+ va_end(args);
+
+ trace_sched_ext_dump(line_buf);
+ }
+#endif
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size) {
+ va_start(args, fmt);
+ seq_buf_vprintf(s, fmt, args);
+ va_end(args);
+
+ seq_buf_putc(s, '\n');
+ }
+}
+
+static void dump_stack_trace(struct seq_buf *s, const char *prefix,
+ const unsigned long *bt, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ dump_line(s, "%s%pS", prefix, (void *)bt[i]);
+}
+
+static void ops_dump_init(struct seq_buf *s, const char *prefix)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+
+ lockdep_assert_irqs_disabled();
+
+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
+ dd->first = true;
+ dd->cursor = 0;
+ dd->s = s;
+ dd->prefix = prefix;
+}
+
+static void ops_dump_flush(void)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ char *line = dd->buf.line;
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * There's something to flush and this is the first line. Insert a blank
+ * line to distinguish ops dump.
+ */
+ if (dd->first) {
+ dump_newline(dd->s);
+ dd->first = false;
+ }
+
+ /*
+ * There may be multiple lines in $line. Scan and emit each line
+ * separately.
+ */
+ while (true) {
+ char *end = line;
+ char c;
+
+ while (*end != '\n' && *end != '\0')
+ end++;
+
+ /*
+ * If $line overflowed, it may not have newline at the end.
+ * Always emit with a newline.
+ */
+ c = *end;
+ *end = '\0';
+ dump_line(dd->s, "%s%s", dd->prefix, line);
+ if (c == '\0')
+ break;
+
+ /* move to the next line */
+ end++;
+ if (*end == '\0')
+ break;
+ line = end;
+ }
+
+ dd->cursor = 0;
+}
+
+static void ops_dump_exit(void)
+{
+ ops_dump_flush();
+ scx_dump_data.cpu = -1;
+}
+
+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct task_struct *p, char marker)
+{
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+ unsigned int bt_len = 0;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+ (unsigned long long)p->scx.dsq->id);
+
+ dump_newline(s);
+ dump_line(s, " %c%c %s[%d] %+ldms",
+ marker, task_state_to_char(p), p->comm, p->pid,
+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+ p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
+ ops_state >> SCX_OPSS_QSEQ_SHIFT);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
+ p->scx.dsq_vtime);
+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+
+ if (SCX_HAS_OP(dump_task)) {
+ ops_dump_init(s, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ ops_dump_exit();
+ }
+
+#ifdef CONFIG_STACKTRACE
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
+ }
+}
+
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+ static DEFINE_SPINLOCK(dump_lock);
+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_dump_ctx dctx = {
+ .kind = ei->kind,
+ .exit_code = ei->exit_code,
+ .reason = ei->reason,
+ .at_ns = ktime_get_ns(),
+ .at_jiffies = jiffies,
+ };
+ struct seq_buf s;
+ unsigned long flags;
+ char *buf;
+ int cpu;
+
+ spin_lock_irqsave(&dump_lock, flags);
+
+ seq_buf_init(&s, ei->dump, dump_len);
+
+ if (ei->kind == SCX_EXIT_NONE) {
+ dump_line(&s, "Debug dump triggered by %s", ei->reason);
+ } else {
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
+ dump_line(&s, " %s (%s)", ei->reason, ei->msg);
+ dump_newline(&s);
+ dump_line(&s, "Backtrace:");
+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
+ }
+
+ if (SCX_HAS_OP(dump)) {
+ ops_dump_init(&s, "");
+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ ops_dump_exit();
+ }
+
+ dump_newline(&s);
+ dump_line(&s, "CPU states");
+ dump_line(&s, "----------");
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ bool idle;
+
+ rq_lock(rq, &rf);
+
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(&s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
+ cpu, rq->scx.nr_running, rq->scx.flags,
+ rq->scx.cpu_released, rq->scx.ops_qseq,
+ rq->scx.pnt_seq);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+ if (!cpumask_empty(rq->scx.cpus_to_kick))
+ dump_line(&ns, " cpus_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick));
+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
+ dump_line(&ns, " idle_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
+ if (!cpumask_empty(rq->scx.cpus_to_preempt))
+ dump_line(&ns, " cpus_to_preempt: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_preempt));
+ if (!cpumask_empty(rq->scx.cpus_to_wait))
+ dump_line(&ns, " cpus_to_wait : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait));
+
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(&s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(&s);
+ }
+
+ if (rq->curr->sched_class == &ext_sched_class)
+ scx_dump_task(&s, &dctx, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ scx_dump_task(&s, &dctx, p, ' ');
+ next:
+ rq_unlock(rq, &rf);
+ }
+
+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
+ memcpy(ei->dump + dump_len - sizeof(trunc_marker),
+ trunc_marker, sizeof(trunc_marker));
+
+ spin_unlock_irqrestore(&dump_lock, flags);
+}
+
+static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+
+ if (ei->kind >= SCX_EXIT_ERROR)
+ scx_dump_state(ei, scx_ops.exit_dump_len);
+
+ schedule_scx_ops_disable_work();
+}
+
+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
+
+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
+ s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+ int none = SCX_EXIT_NONE;
+ va_list args;
+
+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ return;
+
+ ei->exit_code = exit_code;
+#ifdef CONFIG_STACKTRACE
+ if (kind >= SCX_EXIT_ERROR)
+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ va_start(args, fmt);
+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
+ va_end(args);
+
+ /*
+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+ * in scx_ops_disable_workfn().
+ */
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ irq_work_queue(&scx_ops_error_irq_work);
+}
+
+static struct kthread_worker *scx_create_rt_helper(const char *name)
+{
+ struct kthread_worker *helper;
+
+ helper = kthread_create_worker(0, name);
+ if (helper)
+ sched_set_fifo(helper->task);
+ return helper;
+}
+
+static void check_hotplug_seq(const struct sched_ext_ops *ops)
+{
+ unsigned long long global_hotplug_seq;
+
+ /*
+ * If a hotplug event has occurred between when a scheduler was
+ * initialized, and when we were able to attach, exit and notify user
+ * space about it.
+ */
+ if (ops->hotplug_seq) {
+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+ if (ops->hotplug_seq != global_hotplug_seq) {
+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
+ }
+ }
+}
+
+static int validate_ops(const struct sched_ext_ops *ops)
+{
+ /*
+ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
+ * ops.enqueue() callback isn't implemented.
+ */
+ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
+ scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ unsigned long timeout;
+ int i, cpu, ret;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
+ return -EINVAL;
+ }
+
+ mutex_lock(&scx_ops_enable_mutex);
+
+ if (!scx_ops_helper) {
+ WRITE_ONCE(scx_ops_helper,
+ scx_create_rt_helper("sched_ext_ops_helper"));
+ if (!scx_ops_helper) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+ }
+
+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ ret = -EBUSY;
+ goto err_unlock;
+ }
+
+ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
+ if (!scx_root_kobj) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ scx_root_kobj->kset = scx_kset;
+ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err;
+
+ scx_exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!scx_exit_info) {
+ ret = -ENOMEM;
+ goto err_del;
+ }
+
+ /*
+ * Set scx_ops, transition to PREPPING and clear exit info to arm the
+ * disable path. Failure triggers full disabling from here on.
+ */
+ scx_ops = *ops;
+
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
+ SCX_OPS_DISABLED);
+
+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
+ scx_warned_zero_slice = false;
+
+ atomic_long_set(&scx_nr_rejected, 0);
+
+ for_each_possible_cpu(cpu)
+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
+ /*
+ * Keep CPUs stable during enable so that the BPF scheduler can track
+ * online CPUs by watching ->on/offline_cpu() after ->init().
+ */
+ cpus_read_lock();
+
+ if (scx_ops.init) {
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ if (ret) {
+ ret = ops_sanitize_err("init", ret);
+ goto err_disable_unlock_cpus;
+ }
+ }
+
+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+ cpus_read_unlock();
+
+ ret = validate_ops(ops);
+ if (ret)
+ goto err_disable;
+
+ WARN_ON_ONCE(scx_dsp_ctx);
+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
+ scx_dsp_max_batch),
+ __alignof__(struct scx_dsp_ctx));
+ if (!scx_dsp_ctx) {
+ ret = -ENOMEM;
+ goto err_disable;
+ }
+
+ if (ops->timeout_ms)
+ timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+ WRITE_ONCE(scx_watchdog_timeout, timeout);
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ scx_watchdog_timeout / 2);
+
+ /*
+ * Lock out forks, cgroup on/offlining and moves before opening the
+ * floodgate so that they don't wander into the operations prematurely.
+ *
+ * We don't need to keep the CPUs stable but static_branch_*() requires
+ * cpus_read_lock() and scx_cgroup_rwsem must nest inside
+ * cpu_hotplug_lock because of the following dependency chain:
+ *
+ * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
+ *
+ * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use
+ * static_branch_*_cpuslocked().
+ *
+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
+ * following dependency chain:
+ *
+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
+ */
+ percpu_down_write(&scx_fork_rwsem);
+ cpus_read_lock();
+ scx_cgroup_lock();
+
+ check_hotplug_seq(ops);
+
+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+ if (ops->flags & SCX_OPS_ENQ_LAST)
+ static_branch_enable_cpuslocked(&scx_ops_enq_last);
+
+ if (ops->flags & SCX_OPS_ENQ_EXITING)
+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
+ if (scx_ops.cpu_acquire || scx_ops.cpu_release)
+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
+
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
+ reset_idle_masks();
+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
+ } else {
+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
+ }
+
+ /*
+ * All cgroups should be initialized before letting in tasks. cgroup
+ * on/offlining and task migrations are already locked out.
+ */
+ ret = scx_cgroup_init();
+ if (ret)
+ goto err_disable_unlock_all;
+
+ static_branch_enable_cpuslocked(&__scx_ops_enabled);
+
+ /*
+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem
+ * preventing new tasks from being added. No need to exclude tasks
+ * leaving as sched_ext_free() can handle both prepped and enabled
+ * tasks. Prep all tasks first and then enable them with preemption
+ * disabled.
+ */
+ spin_lock_irq(&scx_tasks_lock);
+
+ scx_task_iter_init(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * @p may already be dead, have lost all its usages counts and
+ * be waiting for RCU grace period before being freed. @p can't
+ * be initialized for SCX in such cases and should be ignored.
+ */
+ if (!tryget_task_struct(p))
+ continue;
+
+ scx_task_iter_rq_unlock(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+
+ ret = scx_ops_init_task(p, task_group(p), false);
+ if (ret) {
+ put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_exit(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
+ ret, p->comm, p->pid);
+ goto err_disable_unlock_all;
+ }
+
+ put_task_struct(p);
+ spin_lock_irq(&scx_tasks_lock);
+ }
+ scx_task_iter_exit(&sti);
+
+ /*
+ * All tasks are prepped but are still ops-disabled. Ensure that
+ * %current can't be scheduled out and switch everyone.
+ * preempt_disable() is necessary because we can't guarantee that
+ * %current won't be starved if scheduled out while switching.
+ */
+ preempt_disable();
+
+ /*
+ * From here on, the disable path must assume that tasks have ops
+ * enabled and need to be recovered.
+ *
+ * Transition to ENABLING fails iff the BPF scheduler has already
+ * triggered scx_bpf_error(). Returning an error code here would lose
+ * the recorded error information. Exit indicating success so that the
+ * error is notified through ops.exit() with all the details.
+ */
+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
+ preempt_enable();
+ spin_unlock_irq(&scx_tasks_lock);
+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ ret = 0;
+ goto err_disable_unlock_all;
+ }
+
+ /*
+ * We're fully committed and can't fail. The PREPPED -> ENABLED
+ * transitions here are synchronized against sched_ext_free() through
+ * scx_tasks_lock.
+ */
+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
+
+ scx_task_iter_init(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ struct sched_enq_and_set_ctx ctx;
+
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
+ scx_set_task_state(p, SCX_TASK_READY);
+ __setscheduler_prio(p, p->prio);
+ check_class_changing(task_rq(p), p, old_class);
+
+ sched_enq_and_set_task(&ctx);
+
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ }
+ scx_task_iter_exit(&sti);
+
+ spin_unlock_irq(&scx_tasks_lock);
+ preempt_enable();
+ scx_cgroup_unlock();
+ cpus_read_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /* see above ENABLING transition for the explanation on exiting with 0 */
+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ ret = 0;
+ goto err_disable;
+ }
+
+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ static_branch_enable(&__scx_switched_all);
+
+ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
+ scx_ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(scx_root_kobj, KOBJ_ADD);
+ mutex_unlock(&scx_ops_enable_mutex);
+
+ atomic_long_inc(&scx_enable_seq);
+
+ return 0;
+
+err_del:
+ kobject_del(scx_root_kobj);
+err:
+ kobject_put(scx_root_kobj);
+ scx_root_kobj = NULL;
+ if (scx_exit_info) {
+ free_exit_info(scx_exit_info);
+ scx_exit_info = NULL;
+ }
+err_unlock:
+ mutex_unlock(&scx_ops_enable_mutex);
+ return ret;
+
+err_disable_unlock_all:
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+err_disable_unlock_cpus:
+ cpus_read_unlock();
+err_disable:
+ mutex_unlock(&scx_ops_enable_mutex);
+ /* must be fully disabled before returning */
+ scx_ops_disable(SCX_EXIT_ERROR);
+ kthread_flush_work(&scx_ops_disable_work);
+ return ret;
+}
+
+
+/********************************************************************************
+ * bpf_struct_ops plumbing.
+ */
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+extern struct btf *btf_vmlinux;
+static const struct btf_type *task_struct_type;
+static u32 task_struct_type_id;
+
+static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ struct btf *btf = bpf_get_btf_vmlinux();
+ const struct bpf_struct_ops_desc *st_ops_desc;
+ const struct btf_member *member;
+ const struct btf_type *t;
+ u32 btf_id, member_idx;
+ const char *mname;
+
+ /* struct_ops op args are all sequential, 64-bit numbers */
+ if (off != arg_n * sizeof(__u64))
+ return false;
+
+ /* btf_id should be the type id of struct sched_ext_ops */
+ btf_id = prog->aux->attach_btf_id;
+ st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+ if (!st_ops_desc)
+ return false;
+
+ /* BTF type of struct sched_ext_ops */
+ t = st_ops_desc->type;
+
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t))
+ return false;
+
+ /*
+ * Get the member name of this struct_ops program, which corresponds to
+ * a field in struct sched_ext_ops. For example, the member name of the
+ * dispatch struct_ops program (callback) is "dispatch".
+ */
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+
+ if (!strcmp(mname, op)) {
+ /*
+ * The value is a pointer to a type (struct task_struct) given
+ * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED),
+ * however, can be a NULL (PTR_MAYBE_NULL). The BPF program
+ * should check the pointer to make sure it is not NULL before
+ * using it, or the verifier will reject the program.
+ *
+ * Longer term, this is something that should be addressed by
+ * BTF, and be fully contained within the verifier.
+ */
+ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED;
+ info->btf = btf_vmlinux;
+ info->btf_id = task_struct_type_id;
+
+ return true;
+ }
+
+ return false;
+}
+
+static bool bpf_scx_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type != BPF_READ)
+ return false;
+ if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) ||
+ set_arg_maybe_null("yield", 1, off, size, type, prog, info))
+ return true;
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ return btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg, int off,
+ int size)
+{
+ const struct btf_type *t;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t == task_struct_type) {
+ if (off >= offsetof(struct task_struct, scx.slice) &&
+ off + size <= offsetofend(struct task_struct, scx.slice))
+ return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
+ return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.disallow) &&
+ off + size <= offsetofend(struct task_struct, scx.disallow))
+ return SCALAR_VALUE;
+ }
+
+ return -EACCES;
+}
+
+static const struct bpf_func_proto *
+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_task_storage_get:
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ return &bpf_task_storage_delete_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
+ .get_func_proto = bpf_scx_get_func_proto,
+ .is_valid_access = bpf_scx_is_valid_access,
+ .btf_struct_access = bpf_scx_btf_struct_access,
+};
+
+static int bpf_scx_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct sched_ext_ops *uops = udata;
+ struct sched_ext_ops *ops = kdata;
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+ int ret;
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, dispatch_max_batch):
+ if (*(u32 *)(udata + moff) > INT_MAX)
+ return -E2BIG;
+ ops->dispatch_max_batch = *(u32 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, flags):
+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
+ return -EINVAL;
+ ops->flags = *(u64 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, name):
+ ret = bpf_obj_name_cpy(ops->name, uops->name,
+ sizeof(ops->name));
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -EINVAL;
+ return 1;
+ case offsetof(struct sched_ext_ops, timeout_ms):
+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
+ SCX_WATCHDOG_MAX_TIMEOUT)
+ return -E2BIG;
+ ops->timeout_ms = *(u32 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, exit_dump_len):
+ ops->exit_dump_len =
+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
+ return 1;
+ case offsetof(struct sched_ext_ops, hotplug_seq):
+ ops->hotplug_seq = *(u64 *)(udata + moff);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_scx_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, init_task):
+#ifdef CONFIG_EXT_GROUP_SCHED
+ case offsetof(struct sched_ext_ops, cgroup_init):
+ case offsetof(struct sched_ext_ops, cgroup_exit):
+ case offsetof(struct sched_ext_ops, cgroup_prep_move):
+#endif
+ case offsetof(struct sched_ext_ops, cpu_online):
+ case offsetof(struct sched_ext_ops, cpu_offline):
+ case offsetof(struct sched_ext_ops, init):
+ case offsetof(struct sched_ext_ops, exit):
+ break;
+ default:
+ if (prog->sleepable)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_scx_reg(void *kdata, struct bpf_link *link)
+{
+ return scx_ops_enable(kdata, link);
+}
+
+static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
+{
+ scx_ops_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&scx_ops_disable_work);
+}
+
+static int bpf_scx_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ task_struct_type = btf_type_by_id(btf, type_id);
+ task_struct_type_id = type_id;
+
+ return 0;
+}
+
+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ /*
+ * sched_ext does not support updating the actively-loaded BPF
+ * scheduler, as registering a BPF scheduler can always fail if the
+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
+ * etc. Similarly, we can always race with unregistration happening
+ * elsewhere, such as with sysrq.
+ */
+ return -EOPNOTSUPP;
+}
+
+static int bpf_scx_validate(void *kdata)
+{
+ return 0;
+}
+
+static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
+static void tick_stub(struct task_struct *p) {}
+static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
+static void running_stub(struct task_struct *p) {}
+static void stopping_stub(struct task_struct *p, bool runnable) {}
+static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
+static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
+static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
+static void set_weight_stub(struct task_struct *p, u32 weight) {}
+static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
+static void update_idle_stub(s32 cpu, bool idle) {}
+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void enable_stub(struct task_struct *p) {}
+static void disable_stub(struct task_struct *p) {}
+#ifdef CONFIG_EXT_GROUP_SCHED
+static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void cgroup_exit_stub(struct cgroup *cgrp) {}
+static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
+#endif
+static void cpu_online_stub(s32 cpu) {}
+static void cpu_offline_stub(s32 cpu) {}
+static s32 init_stub(void) { return -EINVAL; }
+static void exit_stub(struct scx_exit_info *info) {}
+static void dump_stub(struct scx_dump_ctx *ctx) {}
+static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
+static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+
+static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
+ .select_cpu = select_cpu_stub,
+ .enqueue = enqueue_stub,
+ .dequeue = dequeue_stub,
+ .dispatch = dispatch_stub,
+ .tick = tick_stub,
+ .runnable = runnable_stub,
+ .running = running_stub,
+ .stopping = stopping_stub,
+ .quiescent = quiescent_stub,
+ .yield = yield_stub,
+ .core_sched_before = core_sched_before_stub,
+ .set_weight = set_weight_stub,
+ .set_cpumask = set_cpumask_stub,
+ .update_idle = update_idle_stub,
+ .cpu_acquire = cpu_acquire_stub,
+ .cpu_release = cpu_release_stub,
+ .init_task = init_task_stub,
+ .exit_task = exit_task_stub,
+ .enable = enable_stub,
+ .disable = disable_stub,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ .cgroup_init = cgroup_init_stub,
+ .cgroup_exit = cgroup_exit_stub,
+ .cgroup_prep_move = cgroup_prep_move_stub,
+ .cgroup_move = cgroup_move_stub,
+ .cgroup_cancel_move = cgroup_cancel_move_stub,
+ .cgroup_set_weight = cgroup_set_weight_stub,
+#endif
+ .cpu_online = cpu_online_stub,
+ .cpu_offline = cpu_offline_stub,
+ .init = init_stub,
+ .exit = exit_stub,
+ .dump = dump_stub,
+ .dump_cpu = dump_cpu_stub,
+ .dump_task = dump_task_stub,
+};
+
+static struct bpf_struct_ops bpf_sched_ext_ops = {
+ .verifier_ops = &bpf_scx_verifier_ops,
+ .reg = bpf_scx_reg,
+ .unreg = bpf_scx_unreg,
+ .check_member = bpf_scx_check_member,
+ .init_member = bpf_scx_init_member,
+ .init = bpf_scx_init,
+ .update = bpf_scx_update,
+ .validate = bpf_scx_validate,
+ .name = "sched_ext_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_ops_sched_ext_ops
+};
+
+
+/********************************************************************************
+ * System integration and init.
+ */
+
+static void sysrq_handle_sched_ext_reset(u8 key)
+{
+ if (scx_ops_helper)
+ scx_ops_disable(SCX_EXIT_SYSRQ);
+ else
+ pr_info("sched_ext: BPF scheduler not yet used\n");
+}
+
+static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
+ .handler = sysrq_handle_sched_ext_reset,
+ .help_msg = "reset-sched-ext(S)",
+ .action_msg = "Disable sched_ext and revert all tasks to CFS",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+static void sysrq_handle_sched_ext_dump(u8 key)
+{
+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+
+ if (scx_enabled())
+ scx_dump_state(&ei, 0);
+}
+
+static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
+ .handler = sysrq_handle_sched_ext_dump,
+ .help_msg = "dump-sched-ext(D)",
+ .action_msg = "Trigger sched_ext debug dump",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+static bool can_skip_idle_kick(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * We can skip idle kicking if @rq is going to go through at least one
+ * full SCX scheduling cycle before going idle. Just checking whether
+ * curr is not idle is insufficient because we could be racing
+ * balance_one() trying to pull the next task from a remote rq, which
+ * may fail, and @rq may become idle afterwards.
+ *
+ * The race window is small and we don't and can't guarantee that @rq is
+ * only kicked while idle anyway. Skip only when sure.
+ */
+ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
+}
+
+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_rq *this_scx = &this_rq->scx;
+ bool should_wait = false;
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+
+ /*
+ * During CPU hotplug, a CPU may depend on kicking itself to make
+ * forward progress. Allow kicking self regardless of online state.
+ */
+ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+ if (rq->curr->sched_class == &ext_sched_class)
+ rq->curr->scx.slice = 0;
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ }
+
+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
+ pseqs[cpu] = rq->scx.pnt_seq;
+ should_wait = true;
+ }
+
+ resched_curr(rq);
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ return should_wait;
+}
+
+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+
+ if (!can_skip_idle_kick(rq) &&
+ (cpu_online(cpu) || cpu == cpu_of(this_rq)))
+ resched_curr(rq);
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+}
+
+static void kick_cpus_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_rq *this_scx = &this_rq->scx;
+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+ bool should_wait = false;
+ s32 cpu;
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick) {
+ should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
+ kick_one_cpu_if_idle(cpu, this_rq);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ if (!should_wait)
+ return;
+
+ for_each_cpu(cpu, this_scx->cpus_to_wait) {
+ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
+
+ if (cpu != cpu_of(this_rq)) {
+ /*
+ * Pairs with smp_store_release() issued by this CPU in
+ * scx_next_task_picked() on the resched path.
+ *
+ * We busy-wait here to guarantee that no other task can
+ * be scheduled on our core before the target CPU has
+ * entered the resched path.
+ */
+ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
+ cpu_relax();
+ }
+
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+}
+
+/**
+ * print_scx_info - print out sched_ext scheduler state
+ * @log_lvl: the log level to use when printing
+ * @p: target task
+ *
+ * If a sched_ext scheduler is enabled, print the name and state of the
+ * scheduler. If @p is on sched_ext, print further information about the task.
+ *
+ * This function can be safely called on any task as long as the task_struct
+ * itself is accessible. While safe, this function isn't synchronized and may
+ * print out mixups or garbages of limited length.
+ */
+void print_scx_info(const char *log_lvl, struct task_struct *p)
+{
+ enum scx_ops_enable_state state = scx_ops_enable_state();
+ const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
+ char runnable_at_buf[22] = "?";
+ struct sched_class *class;
+ unsigned long runnable_at;
+
+ if (state == SCX_OPS_DISABLED)
+ return;
+
+ /*
+ * Carefully check if the task was running on sched_ext, and then
+ * carefully copy the time it's been runnable, and its state.
+ */
+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
+ class != &ext_sched_class) {
+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+ scx_ops_enable_state_str[state], all);
+ return;
+ }
+
+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
+ sizeof(runnable_at)))
+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
+ jiffies_delta_msecs(runnable_at, jiffies));
+
+ /* print everything onto one line to conserve console space */
+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ runnable_at_buf);
+}
+
+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+ /*
+ * SCX schedulers often have userspace components which are sometimes
+ * involved in critial scheduling paths. PM operations involve freezing
+ * userspace which can lead to scheduling misbehaviors including stalls.
+ * Let's bypass while PM operations are in progress.
+ */
+ switch (event) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ case PM_RESTORE_PREPARE:
+ scx_ops_bypass(true);
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ case PM_POST_RESTORE:
+ scx_ops_bypass(false);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_pm_notifier = {
+ .notifier_call = scx_pm_handler,
+};
+
+void __init init_sched_ext_class(void)
+{
+ s32 cpu, v;
+
+ /*
+ * The following is to prevent the compiler from optimizing out the enum
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
+ SCX_TG_ONLINE);
+
+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
+#ifdef CONFIG_SMP
+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
+#endif
+ scx_kick_cpus_pnt_seqs =
+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
+ __alignof__(scx_kick_cpus_pnt_seqs[0]));
+ BUG_ON(!scx_kick_cpus_pnt_seqs);
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ INIT_LIST_HEAD(&rq->scx.runnable_list);
+ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
+
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+
+ if (cpu_online(cpu))
+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
+ }
+
+ register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ register_sysrq_key('D', &sysrq_sched_ext_dump_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+}
+
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+#include <linux/btf_ids.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle)
+{
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
+ *is_idle = false;
+ return prev_cpu;
+ }
+#ifdef CONFIG_SMP
+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
+#else
+ *is_idle = false;
+ return prev_cpu;
+#endif
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+};
+
+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
+{
+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ return false;
+
+ lockdep_assert_irqs_disabled();
+
+ if (unlikely(!p)) {
+ scx_ops_error("called with NULL task");
+ return false;
+ }
+
+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ return false;
+ }
+
+ return true;
+}
+
+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct task_struct *ddsp_task;
+
+ ddsp_task = __this_cpu_read(direct_dispatch_task);
+ if (ddsp_task) {
+ mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ return;
+ }
+
+ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
+ scx_ops_error("dispatch buffer overflow");
+ return;
+ }
+
+ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
+ .task = p,
+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
+ .dsq_id = dsq_id,
+ .enq_flags = enq_flags,
+ };
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
+ * to call this function spuriously. Can be called from ops.enqueue(),
+ * ops.select_cpu(), and ops.dispatch().
+ *
+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
+ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
+ * used to target the local DSQ of a CPU other than the enqueueing one. Use
+ * ops.select_cpu() to be on the target CPU in the first place.
+ *
+ * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
+ * will be directly dispatched to the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
+ * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
+ * task is dispatched.
+ *
+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
+ * and this function can be called upto ops.dispatch_max_batch times to dispatch
+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ *
+ * This function doesn't have any locking restrictions and may be called under
+ * BPF locks (in the future when BPF introduces more flexible locking).
+ *
+ * @p is allowed to run for @slice. The scheduling path is triggered on slice
+ * exhaustion. If zero, the current residual slice is maintained. If
+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
+ * scx_bpf_kick_cpu() to trigger scheduling.
+ */
+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 enq_flags)
+{
+ if (!scx_dispatch_preamble(p, enq_flags))
+ return;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ scx_dispatch_commit(p, dsq_id, enq_flags);
+}
+
+/**
+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime and always
+ * consumed after the tasks in the FIFO queue. All other aspects are identical
+ * to scx_bpf_dispatch().
+ *
+ * @vtime ordering is according to time_before64() which considers wrapping. A
+ * numerically larger vtime may indicate an earlier position in the ordering and
+ * vice-versa.
+ */
+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ if (!scx_dispatch_preamble(p, enq_flags))
+ return;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ p->scx.dsq_vtime = vtime;
+
+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_enqueue_dispatch,
+};
+
+static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+ struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
+ bool dispatched = false;
+ bool in_balance;
+ unsigned long flags;
+
+ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ return false;
+
+ /*
+ * Can be called from either ops.dispatch() locking this_rq() or any
+ * context where no rq lock is held. If latter, lock @p's task_rq which
+ * we'll likely need anyway.
+ */
+ src_rq = task_rq(p);
+
+ local_irq_save(flags);
+ this_rq = this_rq();
+ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
+
+ if (in_balance) {
+ if (this_rq != src_rq) {
+ raw_spin_rq_unlock(this_rq);
+ raw_spin_rq_lock(src_rq);
+ }
+ } else {
+ raw_spin_rq_lock(src_rq);
+ }
+
+ locked_rq = src_rq;
+ raw_spin_lock(&src_dsq->lock);
+
+ /*
+ * Did someone else get to it? @p could have already left $src_dsq, got
+ * re-enqueud, or be in the process of being consumed by someone else.
+ */
+ if (unlikely(p->scx.dsq != src_dsq ||
+ u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
+ p->scx.holding_cpu >= 0) ||
+ WARN_ON_ONCE(src_rq != task_rq(p))) {
+ raw_spin_unlock(&src_dsq->lock);
+ goto out;
+ }
+
+ /* @p is still on $src_dsq and stable, determine the destination */
+ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ dst_dsq = &scx_dsq_global;
+ dst_rq = src_rq;
+ }
+ } else {
+ /* no need to migrate if destination is a non-local DSQ */
+ dst_rq = src_rq;
+ }
+
+ /*
+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+ * CPU, @p will be migrated.
+ */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ /* @p is going from a non-local DSQ to a local DSQ */
+ if (src_rq == dst_rq) {
+ task_unlink_from_dsq(p, src_dsq);
+ move_local_task_to_local_dsq(p, enq_flags,
+ src_dsq, dst_rq);
+ raw_spin_unlock(&src_dsq->lock);
+ } else {
+ raw_spin_unlock(&src_dsq->lock);
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ locked_rq = dst_rq;
+ }
+ } else {
+ /*
+ * @p is going from a non-local DSQ to a non-local DSQ. As
+ * $src_dsq is already locked, do an abbreviated dequeue.
+ */
+ task_unlink_from_dsq(p, src_dsq);
+ p->scx.dsq = NULL;
+ raw_spin_unlock(&src_dsq->lock);
+
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+ p->scx.dsq_vtime = kit->vtime;
+ dispatch_enqueue(dst_dsq, p, enq_flags);
+ }
+
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
+ p->scx.slice = kit->slice;
+
+ dispatched = true;
+out:
+ if (in_balance) {
+ if (this_rq != locked_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(this_rq);
+ }
+ } else {
+ raw_spin_rq_unlock_irqrestore(locked_rq, flags);
+ }
+
+ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME);
+ return dispatched;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
+ *
+ * Can only be called from ops.dispatch().
+ */
+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
+{
+ if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ return 0;
+
+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
+}
+
+/**
+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch
+ *
+ * Cancel the latest dispatch. Can be called multiple times to cancel further
+ * dispatches. Can only be called from ops.dispatch().
+ */
+__bpf_kfunc void scx_bpf_dispatch_cancel(void)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+
+ if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ return;
+
+ if (dspc->cursor > 0)
+ dspc->cursor--;
+ else
+ scx_ops_error("dispatch buffer underflow");
+}
+
+/**
+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ
+ * @dsq_id: DSQ to consume
+ *
+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it
+ * to the current CPU's local DSQ for execution. Can only be called from
+ * ops.dispatch().
+ *
+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before
+ * trying to consume the specified DSQ. It may also grab rq locks and thus can't
+ * be called under any BPF locks.
+ *
+ * Returns %true if a task has been consumed, %false if there isn't any task to
+ * consume.
+ */
+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_dispatch_q *dsq;
+
+ if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ return false;
+
+ flush_dispatch_buf(dspc->rq);
+
+ dsq = find_non_local_dsq(dsq_id);
+ if (unlikely(!dsq)) {
+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ return false;
+ }
+
+ if (consume_dispatch_q(dspc->rq, dsq)) {
+ /*
+ * A successfully consumed task can be dequeued before it starts
+ * running while the CPU is trying to migrate other dispatched
+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
+ * local DSQ.
+ */
+ dspc->nr_tasks++;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
+ * @it__iter: DSQ iterator in progress
+ * @slice: duration the dispatched task can run for in nsecs
+ *
+ * Override the slice of the next task that will be dispatched from @it__iter
+ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
+ * the previous slice duration is kept.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+ struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ kit->slice = slice;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
+}
+
+/**
+ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
+ * @it__iter: DSQ iterator in progress
+ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
+ *
+ * Override the vtime of the next task that will be dispatched from @it__iter
+ * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the
+ * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
+ * dispatch the next task, the override is ignored and cleared.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ kit->vtime = vtime;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
+}
+
+/**
+ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
+ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
+ * be the destination.
+ *
+ * For the transfer to be successful, @p must still be on the DSQ and have been
+ * queued before the DSQ iteration started. This function doesn't care whether
+ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
+ * been queued before the iteration started.
+ *
+ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
+ * update.
+ *
+ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
+ * lock (e.g. BPF timers or SYSCALL programs).
+ *
+ * Returns %true if @p has been consumed, %false if @p had already been consumed
+ * or dequeued.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags);
+}
+
+/**
+ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
+ * priority queue of the DSQ specified by @dsq_id. The destination must be a
+ * user DSQ as only user DSQs support priority queue.
+ *
+ * @p's slice and vtime are kept by default. Use
+ * scx_bpf_dispatch_from_dsq_set_slice() and
+ * scx_bpf_dispatch_from_dsq_set_vtime() to update.
+ *
+ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
+ * scx_bpf_dispatch_vtime() for more information on @vtime.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_dispatch,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+ LIST_HEAD(tasks);
+ u32 nr_enqueued = 0;
+ struct rq *rq;
+ struct task_struct *p, *n;
+
+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ return 0;
+
+ rq = cpu_rq(smp_processor_id());
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The BPF scheduler may choose to dispatch tasks back to
+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ * first to avoid processing the same tasks repeatedly.
+ */
+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ scx.dsq_list.node) {
+ /*
+ * If @p is being migrated, @p's current CPU may not agree with
+ * its allowed CPUs and the migration_cpu_stop is about to
+ * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ *
+ * While racing sched property changes may also dequeue and
+ * re-enqueue a migrating task while its current CPU and allowed
+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ * the current local DSQ for running tasks and thus are not
+ * visible to the BPF scheduler.
+ */
+ if (p->migration_pending)
+ continue;
+
+ dispatch_dequeue(rq, p);
+ list_add_tail(&p->scx.dsq_list.node, &tasks);
+ }
+
+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ list_del_init(&p->scx.dsq_list.node);
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+ nr_enqueued++;
+ }
+
+ return nr_enqueued;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_cpu_release,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_create_dsq - Create a custom DSQ
+ * @dsq_id: DSQ to create
+ * @node: NUMA node to allocate from
+ *
+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
+ */
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+{
+ if (unlikely(node >= (int)nr_node_ids ||
+ (node < 0 && node != NUMA_NO_NODE)))
+ return -EINVAL;
+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_unlocked,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct rq *this_rq;
+ unsigned long irq_flags;
+
+ if (!ops_cpu_valid(cpu, NULL))
+ return;
+
+ local_irq_save(irq_flags);
+
+ this_rq = this_rq();
+
+ /*
+ * While bypassing for PM ops, IRQ handling may not be online which can
+ * lead to irq_work_queue() malfunction such as infinite busy wait for
+ * IRQ status update. Suppress kicking.
+ */
+ if (scx_rq_bypassing(this_rq))
+ goto out;
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
+ * rq locks. We can probably be smarter and avoid bouncing if called
+ * from ops which don't hold a rq lock.
+ */
+ if (flags & SCX_KICK_IDLE) {
+ struct rq *target_rq = cpu_rq(cpu);
+
+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
+ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+
+ if (raw_spin_rq_trylock(target_rq)) {
+ if (can_skip_idle_kick(target_rq)) {
+ raw_spin_rq_unlock(target_rq);
+ goto out;
+ }
+ raw_spin_rq_unlock(target_rq);
+ }
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
+ } else {
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
+
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
+ if (flags & SCX_KICK_WAIT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
+ }
+
+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+out:
+ local_irq_restore(irq_flags);
+}
+
+/**
+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks
+ * @dsq_id: id of the DSQ
+ *
+ * Return the number of tasks in the DSQ matching @dsq_id. If not found,
+ * -%ENOENT is returned.
+ */
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+{
+ struct scx_dispatch_q *dsq;
+ s32 ret;
+
+ preempt_disable();
+
+ if (dsq_id == SCX_DSQ_LOCAL) {
+ ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
+ goto out;
+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+ if (ops_cpu_valid(cpu, NULL)) {
+ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
+ goto out;
+ }
+ } else {
+ dsq = find_non_local_dsq(dsq_id);
+ if (dsq) {
+ ret = READ_ONCE(dsq->nr);
+ goto out;
+ }
+ }
+ ret = -ENOENT;
+out:
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * scx_bpf_destroy_dsq - Destroy a custom DSQ
+ * @dsq_id: DSQ to destroy
+ *
+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
+ * which doesn't exist. Can be called from any online scx_ops operations.
+ */
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+{
+ destroy_dsq(dsq_id);
+}
+
+/**
+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
+ * @it: iterator to initialize
+ * @dsq_id: DSQ to iterate
+ * @flags: %SCX_DSQ_ITER_*
+ *
+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
+ * tasks which are already queued when this function is invoked.
+ */
+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
+ u64 flags)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
+ sizeof(struct bpf_iter_scx_dsq));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+ __alignof__(struct bpf_iter_scx_dsq));
+
+ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
+ return -EINVAL;
+
+ kit->dsq = find_non_local_dsq(dsq_id);
+ if (!kit->dsq)
+ return -ENOENT;
+
+ INIT_LIST_HEAD(&kit->cursor.node);
+ kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
+ kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+
+ return 0;
+}
+
+/**
+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
+ * @it: iterator to progress
+ *
+ * Return the next task. See bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!kit->dsq)
+ return NULL;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+
+ if (list_empty(&kit->cursor.node))
+ p = NULL;
+ else
+ p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
+
+ /*
+ * Only tasks which were queued before the iteration started are
+ * visible. This bounds BPF iterations and guarantees that vtime never
+ * jumps in the other direction while iterating.
+ */
+ do {
+ p = nldsq_next_task(kit->dsq, p, rev);
+ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
+
+ if (p) {
+ if (rev)
+ list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
+ else
+ list_move(&kit->cursor.node, &p->scx.dsq_list.node);
+ } else {
+ list_del_init(&kit->cursor.node);
+ }
+
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+
+ return p;
+}
+
+/**
+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
+ * @it: iterator to destroy
+ *
+ * Undo scx_iter_scx_dsq_new().
+ */
+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+ if (!kit->dsq)
+ return;
+
+ if (!list_empty(&kit->cursor.node)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+ list_del_init(&kit->cursor.node);
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+ }
+ kit->dsq = NULL;
+}
+
+__bpf_kfunc_end_defs();
+
+static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
+ char *fmt, unsigned long long *data, u32 data__sz)
+{
+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
+ s32 ret;
+
+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
+ (data__sz && !data)) {
+ scx_ops_error("invalid data=%p and data__sz=%u",
+ (void *)data, data__sz);
+ return -EINVAL;
+ }
+
+ ret = copy_from_kernel_nofault(data_buf, data, data__sz);
+ if (ret < 0) {
+ scx_ops_error("failed to read data fields (%d)", ret);
+ return ret;
+ }
+
+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
+ &bprintf_data);
+ if (ret < 0) {
+ scx_ops_error("format preparation failed (%d)", ret);
+ return ret;
+ }
+
+ ret = bstr_printf(line_buf, line_size, fmt,
+ bprintf_data.bin_args);
+ bpf_bprintf_cleanup(&bprintf_data);
+ if (ret < 0) {
+ scx_ops_error("(\"%s\", %p, %u) failed to format",
+ fmt, data, data__sz);
+ return ret;
+ }
+
+ return ret;
+}
+
+static s32 bstr_format(struct scx_bstr_buf *buf,
+ char *fmt, unsigned long long *data, u32 data__sz)
+{
+ return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ fmt, data, data__sz);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
+ * @exit_code: Exit value to pass to user space via struct scx_exit_info.
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
+ unsigned long long *data, u32 data__sz)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
+ scx_exit_bstr_buf.line);
+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
+}
+
+/**
+ * scx_bpf_error_bstr - Indicate fatal error
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
+ scx_exit_bstr_buf.line);
+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
+}
+
+/**
+ * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
+ * @fmt: format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
+ * dump_task() to generate extra debug dump specific to the BPF scheduler.
+ *
+ * The extra dump may be multiple lines. A single line may be split over
+ * multiple calls. The last line is automatically terminated.
+ */
+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ struct scx_bstr_buf *buf = &dd->buf;
+ s32 ret;
+
+ if (raw_smp_processor_id() != dd->cpu) {
+ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ return;
+ }
+
+ /* append the formatted string to the line buf */
+ ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
+ if (ret < 0) {
+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
+ dd->prefix, fmt, data, data__sz, ret);
+ return;
+ }
+
+ dd->cursor += ret;
+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * If the line buf overflowed or ends in a newline, flush it into the
+ * dump. This is to allow the caller to generate a single line over
+ * multiple calls. As ops_dump_flush() can also handle multiple lines in
+ * the line buf, the only case which can lead to an unexpected
+ * truncation is when the caller keeps generating newlines in the middle
+ * instead of the end consecutively. Don't do that.
+ */
+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
+ ops_dump_flush();
+}
+
+/**
+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the maximum relative capacity of @cpu in relation to the most
+ * performant CPU in the system. The return value is in the range [1,
+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+ if (ops_cpu_valid(cpu, NULL))
+ return arch_scale_cpu_capacity(cpu);
+ else
+ return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the current relative performance of @cpu in relation to its maximum.
+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ *
+ * The current performance level of a CPU in relation to the maximum performance
+ * available in the system can be calculated as follows:
+ *
+ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * The result is in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+ if (ops_cpu_valid(cpu, NULL))
+ return arch_scale_freq_capacity(cpu);
+ else
+ return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ * @flags: %SCX_CPUPERF_* flags
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+ if (unlikely(perf > SCX_CPUPERF_ONE)) {
+ scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ return;
+ }
+
+ if (ops_cpu_valid(cpu, NULL)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->scx.cpuperf_target = perf;
+
+ rcu_read_lock_sched_notrace();
+ cpufreq_update_util(cpu_rq(cpu), 0);
+ rcu_read_unlock_sched_notrace();
+ }
+}
+
+/**
+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
+ *
+ * All valid CPU IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
+{
+ return nr_cpu_ids;
+}
+
+/**
+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
+{
+ return cpu_possible_mask;
+}
+
+/**
+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
+{
+ return cpu_online_mask;
+}
+
+/**
+ * scx_bpf_put_cpumask - Release a possible/online cpumask
+ * @cpumask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global cpumask, which is read-only in the caller and
+ * is never released. The acquire / release semantics here are just used
+ * to make the cpumask is a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return cpu_none_mask;
+ }
+
+#ifdef CONFIG_SMP
+ return idle_masks.cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return cpu_none_mask;
+ }
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_masks.smt;
+ else
+ return idle_masks.cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+ }
+
+ if (ops_cpu_valid(cpu, NULL))
+ return test_and_clear_cpu_idle(cpu);
+ else
+ return false;
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ return -EBUSY;
+ }
+
+ return scx_pick_idle_cpu(cpus_allowed, flags);
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ s32 cpu;
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_task_running - Is task currently running?
+ * @p: task of interest
+ */
+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
+{
+ return task_rq(p)->curr == p;
+}
+
+/**
+ * scx_bpf_task_cpu - CPU a task is currently associated with
+ * @p: task of interest
+ */
+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
+{
+ return task_cpu(p);
+}
+
+/**
+ * scx_bpf_cpu_rq - Fetch the rq of a CPU
+ * @cpu: CPU of the rq
+ */
+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
+{
+ if (!ops_cpu_valid(cpu, NULL))
+ return NULL;
+
+ return cpu_rq(cpu);
+}
+
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
+ * from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
+ * rq-locked operations. Can be called on the parameter tasks of rq-locked
+ * operations. The restriction guarantees that @p's rq is locked by the caller.
+ */
+#ifdef CONFIG_CGROUP_SCHED
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
+{
+ struct task_group *tg = p->sched_task_group;
+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+
+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ goto out;
+
+ /*
+ * A task_group may either be a cgroup or an autogroup. In the latter
+ * case, @tg->css.cgroup is %NULL. A task_group can't become the other
+ * kind once created.
+ */
+ if (tg && tg->css.cgroup)
+ cgrp = tg->css.cgroup;
+ else
+ cgrp = &cgrp_dfl_root.cgrp;
+out:
+ cgroup_get(cgrp);
+ return cgrp;
+}
+#endif
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_any)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+#ifdef CONFIG_CGROUP_SCHED
+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
+#endif
+BTF_KFUNCS_END(scx_kfunc_ids_any)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_any,
+};
+
+static int __init scx_init(void)
+{
+ int ret;
+
+ /*
+ * kfunc registration can't be done from init_sched_ext_class() as
+ * register_btf_kfunc_id_set() needs most of the system to be up.
+ *
+ * Some kfuncs are context-sensitive and can only be called from
+ * specific SCX ops. They are grouped into BTF sets accordingly.
+ * Unfortunately, BPF currently doesn't have a way of enforcing such
+ * restrictions. Eventually, the verifier should be able to enforce
+ * them. For now, register them the same and make each kfunc explicitly
+ * check using scx_kf_allowed().
+ */
+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_select_cpu)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_enqueue_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_cpu_release)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_any))) {
+ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
+ if (ret) {
+ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_pm_notifier(&scx_pm_notifier);
+ if (ret) {
+ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
+ return ret;
+ }
+
+ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
+ if (!scx_kset) {
+ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
+ return -ENOMEM;
+ }
+
+ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
+ if (ret < 0) {
+ pr_err("sched_ext: Failed to add global attributes\n");
+ return ret;
+ }
+
+ return 0;
+}
+__initcall(scx_init);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
new file mode 100644
index 000000000000..246019519231
--- /dev/null
+++ b/kernel/sched/ext.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <[email protected]>
+ * Copyright (c) 2022 David Vernet <[email protected]>
+ */
+#ifdef CONFIG_SCHED_CLASS_EXT
+
+void scx_tick(struct rq *rq);
+void init_scx_entity(struct sched_ext_entity *scx);
+void scx_pre_fork(struct task_struct *p);
+int scx_fork(struct task_struct *p);
+void scx_post_fork(struct task_struct *p);
+void scx_cancel_fork(struct task_struct *p);
+bool scx_can_stop_tick(struct rq *rq);
+void scx_rq_activate(struct rq *rq);
+void scx_rq_deactivate(struct rq *rq);
+int scx_check_setscheduler(struct task_struct *p, int policy);
+bool task_should_scx(struct task_struct *p);
+void init_sched_ext_class(void);
+
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+ if (scx_enabled())
+ return cpu_rq(cpu)->scx.cpuperf_target;
+ else
+ return 0;
+}
+
+static inline bool task_on_scx(const struct task_struct *p)
+{
+ return scx_enabled() && p->sched_class == &ext_sched_class;
+}
+
+#ifdef CONFIG_SCHED_CORE
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi);
+#endif
+
+#else /* CONFIG_SCHED_CLASS_EXT */
+
+static inline void scx_tick(struct rq *rq) {}
+static inline void scx_pre_fork(struct task_struct *p) {}
+static inline int scx_fork(struct task_struct *p) { return 0; }
+static inline void scx_post_fork(struct task_struct *p) {}
+static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
+static inline void scx_rq_activate(struct rq *rq) {}
+static inline void scx_rq_deactivate(struct rq *rq) {}
+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
+static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline void init_sched_ext_class(void) {}
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+void __scx_update_idle(struct rq *rq, bool idle);
+
+static inline void scx_update_idle(struct rq *rq, bool idle)
+{
+ if (scx_enabled())
+ __scx_update_idle(rq, idle);
+}
+#else
+static inline void scx_update_idle(struct rq *rq, bool idle) {}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+#ifdef CONFIG_EXT_GROUP_SCHED
+int scx_tg_online(struct task_group *tg);
+void scx_tg_offline(struct task_group *tg);
+int scx_cgroup_can_attach(struct cgroup_taskset *tset);
+void scx_move_task(struct task_struct *p);
+void scx_cgroup_finish_attach(void);
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
+void scx_group_set_idle(struct task_group *tg, bool idle);
+#else /* CONFIG_EXT_GROUP_SCHED */
+static inline int scx_tg_online(struct task_group *tg) { return 0; }
+static inline void scx_tg_offline(struct task_group *tg) {}
+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
+static inline void scx_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_finish_attach(void) {}
+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
+static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dc9385f6da4..225b31aaee55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
static int se_is_idle(struct sched_entity *se)
{
- return 0;
+ return task_has_idle_policy(task_of(se));
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -779,8 +779,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
/* ensure we never gain time by being placed backwards. */
- u64_u32_store(cfs_rq->min_vruntime,
- __update_min_vruntime(cfs_rq, vruntime));
+ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+}
+
+static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 min_slice = ~0ULL;
+
+ if (curr && curr->on_rq)
+ min_slice = curr->slice;
+
+ if (root)
+ min_slice = min(min_slice, root->min_slice);
+
+ return min_slice;
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -799,19 +813,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node
}
}
+static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->min_slice < se->min_slice)
+ se->min_slice = rse->min_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
+ u64 old_min_slice = se->min_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
__min_vruntime_update(se, node->rb_right);
__min_vruntime_update(se, node->rb_left);
- return se->min_vruntime == old_min_vruntime;
+ se->min_slice = se->slice;
+ __min_slice_update(se, node->rb_right);
+ __min_slice_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime &&
+ se->min_slice == old_min_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -824,6 +853,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
se->min_vruntime = se->vruntime;
+ se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
__entity_less, &min_vruntime_cb);
}
@@ -974,17 +1004,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
*/
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
- return;
+ return false;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
/*
* EEVDF: vd_i = ve_i + r_i / w_i
@@ -994,10 +1025,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* The task has consumed its request, reschedule.
*/
- if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
- clear_buddies(cfs_rq, se);
- }
+ return true;
}
#include "pelt.h"
@@ -1135,6 +1163,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
dl_server_update(p->dl_server, delta_exec);
}
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
+
+static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+}
+
/*
* Used by other classes to account runtime.
*/
@@ -1156,23 +1216,44 @@ s64 update_curr_common(struct rq *rq)
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ delta_exec = update_curr_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_deadline(cfs_rq, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
- if (entity_is_task(curr))
- update_curr_task(task_of(curr), delta_exec);
+ if (entity_is_task(curr)) {
+ struct task_struct *p = task_of(curr);
+
+ update_curr_task(p, delta_exec);
+
+ /*
+ * Any fair task that runs outside of fair_server should
+ * account against fair_server such that it can account for
+ * this time and possibly avoid running this period.
+ */
+ if (p->dl_server != &rq->fair_server)
+ dl_server_update(&rq->fair_server, delta_exec);
+ }
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (rq->nr_running == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
@@ -1742,7 +1823,7 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
continue;
if (zone_watermark_ok(zone, 0,
- wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ promo_wmark_pages(zone) + enough_wmark,
ZONE_MOVABLE, 0))
return true;
}
@@ -1840,8 +1921,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
- if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
- !node_is_toptier(src_nid)) {
+ if (folio_use_access_time(folio)) {
struct pglist_data *pgdat;
unsigned long rate_limit;
unsigned int latency, th, def_th;
@@ -3188,6 +3268,15 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
return true;
}
+ /*
+ * This vma has not been accessed for a while, and if the number
+ * the threads in the same process is low, which means no other
+ * threads can help scan this vma, force a vma scan.
+ */
+ if (READ_ONCE(mm->numa_scan_seq) >
+ (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
+ return true;
+
return false;
}
@@ -3835,7 +3924,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
-void reweight_task(struct task_struct *p, const struct load_weight *lw)
+static void reweight_task_fair(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -5178,7 +5268,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
@@ -5259,6 +5350,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ se->deadline += se->vruntime;
+ se->rel_deadline = 0;
+ return;
+ }
+
/*
* When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
@@ -5279,6 +5376,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
static void
+requeue_delayed_entity(struct sched_entity *se);
+
+static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool curr = cfs_rq->curr == se;
@@ -5365,20 +5465,48 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static void
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ se->sched_delayed = 0;
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+}
+
+static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- int action = UPDATE_TG;
+ bool sleep = flags & DEQUEUE_SLEEP;
+ update_curr(cfs_rq);
+
+ if (flags & DEQUEUE_DELAYED) {
+ SCHED_WARN_ON(!se->sched_delayed);
+ } else {
+ bool delay = sleep;
+ /*
+ * DELAY_DEQUEUE relies on spurious wakeups, special task
+ * states must not suffer spurious wakeups, excempt them.
+ */
+ if (flags & DEQUEUE_SPECIAL)
+ delay = false;
+
+ SCHED_WARN_ON(delay && se->sched_delayed);
+
+ if (sched_feat(DELAY_DEQUEUE) && delay &&
+ !entity_eligible(cfs_rq, se)) {
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ update_load_avg(cfs_rq, se, 0);
+ se->sched_delayed = 1;
+ return false;
+ }
+ }
+
+ int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
/*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
-
- /*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -5395,6 +5523,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
update_entity_lag(cfs_rq, se);
+ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ }
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
@@ -5414,8 +5547,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+ if (flags & DEQUEUE_DELAYED)
+ finish_delayed_dequeue_entity(se);
+
if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+ return true;
}
static void
@@ -5441,6 +5579,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
update_stats_curr_start(cfs_rq, se);
+ SCHED_WARN_ON(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5461,6 +5600,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5469,16 +5610,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
+ }
- return pick_eevdf(cfs_rq);
+ struct sched_entity *se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ SCHED_WARN_ON(se->sched_delayed);
+ SCHED_WARN_ON(se->on_rq);
+ return NULL;
+ }
+ return se;
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5502,6 +5653,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
+ SCHED_WARN_ON(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5765,6 +5917,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
+ long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5798,11 +5951,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ int flags;
+
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ /*
+ * Abuse SPECIAL to avoid delayed dequeue in this instance.
+ * This avoids teaching dequeue_entities() about throttled
+ * entities and keeps things relatively simple.
+ */
+ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+ if (se->sched_delayed)
+ flags |= DEQUEUE_DELAYED;
+ dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
@@ -5836,6 +5999,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, task_delta);
+ /* Stop the fair server if throttling resulted in no runnable tasks */
+ if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
@@ -5854,6 +6020,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta;
+ long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5891,8 +6058,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq)
+ if (se->on_rq) {
+ SCHED_WARN_ON(se->sched_delayed);
break;
+ }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -5923,6 +6092,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
+ /* Start the fair server if un-throttling resulted in new runnable tasks */
+ if (!rq_h_nr_running && rq->cfs.h_nr_running)
+ dl_server_start(&rq->fair_server);
+
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
@@ -6555,7 +6728,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
{
int cpu = cpu_of(rq);
- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ if (!cfs_bandwidth_used())
return;
if (!tick_nohz_full_cpu(cpu))
@@ -6738,6 +6911,37 @@ static int sched_idle_cpu(int cpu)
}
#endif
+static void
+requeue_delayed_entity(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply: se->on_rq == 1.
+ * Because a delayed entity is one that is still on
+ * the runqueue competing until elegibility.
+ */
+ SCHED_WARN_ON(!se->sched_delayed);
+ SCHED_WARN_ON(!se->on_rq);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_running--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_running++;
+ }
+ }
+
+ update_load_avg(cfs_rq, se, 0);
+ se->sched_delayed = 0;
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -6750,6 +6954,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
+ int rq_h_nr_running = rq->cfs.h_nr_running;
+ u64 slice = 0;
/*
* The code below (indirectly) updates schedutil which looks at
@@ -6757,7 +6963,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -6768,10 +6980,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Basically set the slice of group entries to the min_slice of
+ * their respective cfs_rq. This ensures the group can service
+ * its entities in the desired time-frame.
+ */
+ if (slice) {
+ se->slice = slice;
+ se->custom_slice = 1;
+ }
enqueue_entity(cfs_rq, se, flags);
+ slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -6793,6 +7019,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se_update_runnable(se);
update_cfs_group(se);
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -6804,6 +7033,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto enqueue_throttle;
}
+ if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr);
+ dl_server_start(&rq->fair_server);
+ }
+
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
@@ -6833,36 +7069,59 @@ enqueue_throttle:
static void set_next_buddy(struct sched_entity *se);
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ int rq_h_nr_running = rq->cfs.h_nr_running;
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
+ struct task_struct *p = NULL;
+ int idle_h_nr_running = 0;
+ int h_nr_running = 0;
+ struct cfs_rq *cfs_rq;
+ u64 slice = 0;
- util_est_dequeue(&rq->cfs, p);
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_running = 1;
+ idle_h_nr_running = task_has_idle_policy(p);
+ } else {
+ cfs_rq = group_cfs_rq(se);
+ slice = cfs_rq_min_slice(cfs_rq);
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--;
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ break;
+ }
+
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ slice = cfs_rq_min_slice(cfs_rq);
+
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
/*
@@ -6874,6 +7133,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}
for_each_sched_entity(se) {
@@ -6883,28 +7143,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running--;
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
+ return 0;
}
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, h_nr_running);
+
+ if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
-dequeue_throttle:
- util_est_update(&rq->cfs, p, task_sleep);
+ if (p && task_delayed) {
+ SCHED_WARN_ON(!task_sleep);
+ SCHED_WARN_ON(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ hrtick_update(rq);
+
+ /* Fix-up what block_task() skipped. */
+ __block_task(rq, p);
+ }
+
+ return 1;
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
+
+ if (dequeue_entities(rq, &p->se, flags) < 0) {
+ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+ return false;
+ }
+
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
+ return true;
}
#ifdef CONFIG_SMP
@@ -7803,6 +8096,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+
+/*
* energy_env - Utilization landscape for energy estimation.
* @task_busy_time: Utilization contribution by the task for which we test the
* placement. Given by eenv_task_busy_time().
@@ -8286,7 +8678,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
@@ -8322,7 +8728,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- if (rq->nr_running)
+ if (sched_fair_runnable(rq))
return 1;
return sched_balance_newidle(rq, rf) != 0;
@@ -8381,16 +8787,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (test_tsk_need_resched(curr))
return;
- /* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(task_has_idle_policy(curr)) &&
- likely(!task_has_idle_policy(p)))
- goto preempt;
-
- /*
- * Batch and idle tasks do not preempt non-idle tasks (their preemption
- * is driven by the tick):
- */
- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ if (!sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
@@ -8400,7 +8797,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
pse_is_idle = se_is_idle(pse);
/*
- * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
if (cse_is_idle && !pse_is_idle)
@@ -8408,11 +8805,26 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (cse_is_idle != pse_is_idle)
return;
+ /*
+ * BATCH and IDLE tasks do not preempt others.
+ */
+ if (unlikely(!normal_policy(p->policy)))
+ return;
+
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
+ /*
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
/*
- * XXX pick_eevdf(cfs_rq) != se ?
+ * If @p has become the most eligible task, force preemption.
*/
if (pick_eevdf(cfs_rq) == pse)
goto preempt;
@@ -8423,7 +8835,6 @@ preempt:
resched_curr(rq);
}
-#ifdef CONFIG_SMP
static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
@@ -8435,95 +8846,58 @@ again:
return NULL;
do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
+ /* Might not have done put_prev_entity() */
+ if (cfs_rq->curr && cfs_rq->curr->on_rq)
+ update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
- }
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
return task_of(se);
}
-#endif
+
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
again:
- if (!sched_fair_runnable(rq))
+ p = pick_task_fair(rq);
+ if (!p)
goto idle;
+ se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!prev || prev->sched_class != &fair_sched_class)
+ if (prev->sched_class != &fair_sched_class)
goto simple;
+ __put_prev_set_next_dl_server(rq, prev, p);
+
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
- */
-
- do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
-
- /*
- * This call to check_cfs_rq_runtime() will do the
- * throttle and dequeue its entity in the parent(s).
- * Therefore the nr_running test will indeed
- * be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
- }
-
- se = pick_next_entity(cfs_rq);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
-
- p = task_of(se);
-
- /*
+ *
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
+ struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
@@ -8541,38 +8915,15 @@ again:
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
- }
-
- goto done;
-simple:
-#endif
- if (prev)
- put_prev_task(rq, prev);
- do {
- se = pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ __set_next_task_fair(rq, p, true);
+ }
- p = task_of(se);
+ return p;
-done: __maybe_unused;
-#ifdef CONFIG_SMP
- /*
- * Move the next running task to the front of
- * the list, so our cfs_tasks list becomes MRU
- * one.
- */
- list_move(&p->se.group_node, &rq->cfs_tasks);
+simple:
#endif
-
- if (hrtick_enabled_fair(rq))
- hrtick_start_fair(rq, p);
-
- update_misfit_status(p, rq);
- sched_fair_update_stop_tick(rq, p);
-
+ put_prev_set_next_task(rq, prev, p);
return p;
idle:
@@ -8601,15 +8952,34 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq)
+static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ return pick_next_task_fair(rq, prev, NULL);
+}
+
+static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
- return pick_next_task_fair(rq, NULL, NULL);
+ return !!dl_se->rq->cfs.nr_running;
+}
+
+static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+{
+ return pick_task_fair(dl_se->rq);
+}
+
+void fair_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->fair_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
}
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -9347,28 +9717,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {
static bool __update_blocked_others(struct rq *rq, bool *done)
{
- const struct sched_class *curr_class;
- u64 now = rq_clock_pelt(rq);
- unsigned long hw_pressure;
- bool decayed;
+ bool updated;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
- curr_class = rq->curr->sched_class;
-
- hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
-
- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_hw_load_avg(now, rq, hw_pressure) |
- update_irq_load_avg(rq, 0);
+ updated = update_other_load_avgs(rq);
if (others_have_blocked(rq))
*done = false;
- return decayed;
+ return updated;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12702,22 +13062,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se, *curr;
- struct cfs_rq *cfs_rq;
- struct rq *rq = this_rq();
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
-
set_task_max_allowed_capacity(p);
-
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- if (curr)
- update_curr(cfs_rq);
- place_entity(cfs_rq, se, ENQUEUE_INITIAL);
- rq_unlock(rq, &rf);
}
/*
@@ -12829,10 +13174,28 @@ static void attach_task_cfs_rq(struct task_struct *p)
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ /*
+ * Since this is called after changing class, this is a little weird
+ * and we cannot use DEQUEUE_DELAYED.
+ */
+ if (p->se.sched_delayed) {
+ /* First, dequeue it from its new class' structures */
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+ /*
+ * Now, clean up the fair_sched_class side of things
+ * related to sched_delayed being true and that wasn't done
+ * due to the generic dequeue not using DEQUEUE_DELAYED.
+ */
+ finish_delayed_dequeue_entity(&p->se);
+ p->se.rel_deadline = 0;
+ __block_task(rq, p);
+ }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p);
@@ -12850,12 +13213,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
}
}
-/* Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
@@ -12868,6 +13226,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
list_move(&se->group_node, &rq->cfs_tasks);
}
#endif
+ if (!first)
+ return;
+
+ SCHED_WARN_ON(se->sched_delayed);
+
+ if (hrtick_enabled_fair(rq))
+ hrtick_start_fair(rq, p);
+
+ update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
+}
+
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+{
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -12876,12 +13255,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
+
+ __set_next_task_fair(rq, p, first);
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
@@ -12983,28 +13364,35 @@ void online_fair_sched_group(struct task_group *tg)
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}
@@ -13194,13 +13582,13 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = check_preempt_wakeup_fair,
+ .pick_task = pick_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.balance = balance_fair,
- .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13214,6 +13602,7 @@ DEFINE_SCHED_CLASS(fair) = {
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
+ .reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 143f55df890b..290874079f60 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -5,8 +5,24 @@
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+/*
+ * Give new tasks half a slice to ease into the competition.
+ */
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+/*
+ * Preserve relative virtual deadline on 'migration'.
+ */
+SCHED_FEAT(PLACE_REL_DEADLINE, true)
+/*
+ * Inhibit (wakeup) preemption until the current task has either matched the
+ * 0-lag point or until is has exhausted it's slice.
+ */
SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * current.
+ */
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -22,6 +38,18 @@ SCHED_FEAT(NEXT_BUDDY, false)
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(DELAY_ZERO, true)
+
+/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
-
-SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6e78d071beb5..d2f096bb274c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -450,43 +450,37 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
+ dl_server_update_idle_time(rq, prev);
+ scx_update_idle(rq, false);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
+ scx_update_idle(rq, true);
schedstat_inc(rq->sched_goidle);
+ next->se.exec_start = rq_clock_task(rq);
}
-#ifdef CONFIG_SMP
-static struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq)
{
return rq->idle;
}
-#endif
-
-struct task_struct *pick_next_task_idle(struct rq *rq)
-{
- struct task_struct *next = rq->idle;
-
- set_next_task_idle(rq, next, true);
-
- return next;
-}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
-static void
+static bool
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_rq_lock_irq(rq);
+ return true;
}
/*
@@ -528,13 +522,12 @@ DEFINE_SCHED_CLASS(idle) = {
.wakeup_preempt = wakeup_preempt_idle,
- .pick_next_task = pick_next_task_idle,
+ .pick_task = pick_task_idle,
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.balance = balance_idle,
- .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fa52906a4478..a9c65d97b3ca 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -467,3 +467,23 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
#endif
+
+/*
+ * Load avg and utiliztion metrics need to be updated periodically and before
+ * consumption. This function updates the metrics for all subsystems except for
+ * the fair class. @rq must be locked and have its clock updated.
+ */
+bool update_other_load_avgs(struct rq *rq)
+{
+ u64 now = rq_clock_pelt(rq);
+ const struct sched_class *curr_class = rq->curr->sched_class;
+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+
+ /* hw_pressure doesn't care about invariance */
+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
+ update_irq_load_avg(rq, 0);
+}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 2150062949d4..f4f6a0875c66 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -6,6 +6,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+bool update_other_load_avgs(struct rq *rq);
#ifdef CONFIG_SCHED_HW_PRESSURE
int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 310523c1b9e3..172c588de542 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-struct rt_bandwidth def_rt_bandwidth;
-
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void)
late_initcall(sched_rt_sysctl_init);
#endif
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+#endif /* CONFIG_SMP */
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+#endif
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
do_start_rt_bandwidth(rt_b);
}
-void init_rt_rq(struct rt_rq *rt_rq)
-{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bit-search: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
- rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
- /* We start is dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
-
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg)
{
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
-
}
void free_rt_sched_group(struct task_group *tg)
@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!tg->rt_se)
goto err;
- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return &rt_rq->tg->rt_bandwidth;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = NULL)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- if (!rt_rq->rt_nr_running)
- return;
-
- enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &def_rt_bandwidth;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
-#ifdef CONFIG_RT_GROUP_SCHED
+
/*
* FIXME: isolated CPUs should really leave the root task group,
* whether they are isolcpus or were isolated via cpusets, lest
@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
*/
if (rt_b == &root_task_group.rt_bandwidth)
span = cpu_online_mask;
-#endif
+
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
return idle;
}
-static inline int rt_se_prio(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq)
- return rt_rq->highest_prio.curr;
-#endif
-
- return rt_task_of(rt_se)->prio;
-}
-
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return false;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+#ifdef CONFIG_SMP
+static void __enable_runtime(struct rq *rq) { }
+static void __disable_runtime(struct rq *rq) { }
+#endif
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely(delta_exec <= 0))
return;
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity *rt_se = &curr->rt;
+
if (!rt_bandwidth_enabled())
return;
@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
+#endif
}
static void
@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- start_rt_bandwidth(&def_rt_bandwidth);
}
static inline
@@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
@@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
+
+ return true;
}
/*
@@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq)
return p;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
-{
- struct task_struct *p = pick_task_rt(rq);
-
- if (p)
- set_next_task_rt(rq, p, true);
-
- return p;
-}
-
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
struct sched_rt_entity *rt_se = &p->rt;
struct rt_rq *rt_rq = &rq->rt;
@@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = {
.wakeup_preempt = wakeup_preempt_rt,
- .pick_next_task = pick_next_task_rt,
+ .pick_task = pick_task_rt,
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
.balance = balance_rt,
- .pick_task = pick_task_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
@@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
return 0;
}
#endif /* CONFIG_SYSCTL */
@@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c36cc680361..b1c3588a8f00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -68,6 +68,7 @@
#include <linux/wait_api.h>
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
+#include <linux/delayacct.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
@@ -192,9 +193,18 @@ static inline int idle_policy(int policy)
return policy == SCHED_IDLE;
}
+static inline int normal_policy(int policy)
+{
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (policy == SCHED_EXT)
+ return true;
+#endif
+ return policy == SCHED_NORMAL;
+}
+
static inline int fair_policy(int policy)
{
- return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+ return normal_policy(policy) || policy == SCHED_BATCH;
}
static inline int rt_policy(int policy)
@@ -245,6 +255,24 @@ static inline void update_avg(u64 *avg, u64 sample)
(val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
/*
+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are
+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it
+ * maps pretty well onto the shares value used by scheduler and the round-trip
+ * conversions preserve the original value over the entire range.
+ */
+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight)
+{
+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL);
+}
+
+static inline unsigned long sched_weight_to_cgroup(unsigned long weight)
+{
+ return clamp_t(unsigned long,
+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024),
+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
* This is actually gross. :(
@@ -335,7 +363,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
-
+extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
* interface:
@@ -361,7 +389,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
- dl_server_pick_f pick);
+ dl_server_pick_f pick_task);
+
+extern void dl_server_update_idle_time(struct rq *rq,
+ struct task_struct *p);
+extern void fair_server_init(struct rq *rq);
+extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
+extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
+ u64 runtime, u64 period, bool init);
#ifdef CONFIG_CGROUP_SCHED
@@ -397,16 +432,17 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
-
- /* A positive value indicates that this is a SCHED_IDLE group. */
- int idle;
-
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -424,6 +460,11 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
+#ifdef CONFIG_EXT_GROUP_SCHED
+ u32 scx_flags; /* SCX_TG_* */
+ u32 scx_weight;
+#endif
+
struct rcu_head rcu;
struct list_head list;
@@ -448,7 +489,7 @@ struct task_group {
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
@@ -479,6 +520,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
return walk_tg_tree_from(&root_task_group, down, up, data);
}
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
extern int tg_nop(struct task_group *tg, void *data);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -535,6 +581,9 @@ extern void set_task_rq_fair(struct sched_entity *se,
static inline void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next) { }
#endif /* CONFIG_SMP */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -588,6 +637,11 @@ do { \
# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
+};
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -599,17 +653,12 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
- u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 min_vruntime_fi;
#endif
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-#endif
-
struct rb_root_cached tasks_timeline;
/*
@@ -619,10 +668,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
-#ifdef CONFIG_SCHED_DEBUG
- unsigned int nr_spread_over;
-#endif
-
#ifdef CONFIG_SMP
/*
* CFS load tracking
@@ -696,6 +741,44 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ /*
+ * A hotplugged CPU starts scheduling before rq_online_scx(). Track
+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
+ * only while the BPF scheduler considers the CPU to be online.
+ */
+ SCX_RQ_ONLINE = 1 << 0,
+ SCX_RQ_CAN_STOP_TICK = 1 << 1,
+ SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 3,
+
+ SCX_RQ_IN_WAKEUP = 1 << 16,
+ SCX_RQ_IN_BALANCE = 1 << 17,
+};
+
+struct scx_rq {
+ struct scx_dispatch_q local_dsq;
+ struct list_head runnable_list; /* runnable tasks on this rq */
+ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */
+ unsigned long ops_qseq;
+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */
+ u32 nr_running;
+ u32 flags;
+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
+ bool cpu_released;
+ cpumask_var_t cpus_to_kick;
+ cpumask_var_t cpus_to_kick_if_idle;
+ cpumask_var_t cpus_to_preempt;
+ cpumask_var_t cpus_to_wait;
+ unsigned long pnt_seq;
+ struct balance_callback deferred_bal_cb;
+ struct irq_work deferred_irq_work;
+ struct irq_work kick_cpus_irq_work;
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -726,13 +809,13 @@ struct rt_rq {
#endif /* CONFIG_SMP */
int rt_queued;
+#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
-#ifdef CONFIG_RT_GROUP_SCHED
unsigned int rt_nr_boosted;
struct rq *rq;
@@ -820,6 +903,9 @@ static inline void se_update_runnable(struct sched_entity *se)
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
if (entity_is_task(se))
return !!se->on_rq;
else
@@ -834,6 +920,9 @@ static inline void se_update_runnable(struct sched_entity *se) { }
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
return !!se->on_rq;
}
@@ -996,11 +1085,6 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
-struct balance_callback {
- struct balance_callback *next;
- void (*func)(struct rq *rq);
-};
-
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -1043,6 +1127,11 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct scx_rq scx;
+#endif
+
+ struct sched_dl_entity fair_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
@@ -1059,6 +1148,7 @@ struct rq {
unsigned int nr_uninterruptible;
struct task_struct __rcu *curr;
+ struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -1158,7 +1248,6 @@ struct rq {
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
@@ -1187,6 +1276,7 @@ struct rq {
/* per rq */
struct rq *core;
struct task_struct *core_pick;
+ struct sched_dl_entity *core_dl_server;
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
@@ -2247,11 +2337,13 @@ extern const u32 sched_prio_to_wmult[40];
*
*/
-#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2267,6 +2359,7 @@ extern const u32 sched_prio_to_wmult[40];
#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
+#define ENQUEUE_DELAYED 0x200
#define RETRY_TASK ((void *)-1UL)
@@ -2285,23 +2378,31 @@ struct sched_class {
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
- struct task_struct *(*pick_next_task)(struct rq *rq);
+ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ struct task_struct *(*pick_task)(struct rq *rq);
+ /*
+ * Optional! When implemented pick_next_task() should be equivalent to:
+ *
+ * next = pick_task();
+ * if (next) {
+ * put_prev_task(prev);
+ * set_next_task_first(next);
+ * }
+ */
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
- struct task_struct * (*pick_task)(struct rq *rq);
-
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -2323,8 +2424,11 @@ struct sched_class {
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
@@ -2345,7 +2449,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->curr != prev);
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
static inline void set_next_task(struct rq *rq, struct task_struct *next)
@@ -2353,6 +2457,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
+static inline void
+__put_prev_set_next_dl_server(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ prev->dl_server = NULL;
+ next->dl_server = rq->dl_server;
+ rq->dl_server = NULL;
+}
+
+static inline void put_prev_set_next_task(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ WARN_ON_ONCE(rq->curr != prev);
+
+ __put_prev_set_next_dl_server(rq, prev, next);
+
+ if (next == prev)
+ return;
+
+ prev->sched_class->put_prev_task(rq, prev, next);
+ next->sched_class->set_next_task(rq, next, true);
+}
/*
* Helper to define a sched_class instance; each one is placed in a separate
@@ -2373,19 +2501,54 @@ const struct sched_class name##_sched_class \
extern struct sched_class __sched_class_highest[];
extern struct sched_class __sched_class_lowest[];
+extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
+extern const struct sched_class rt_sched_class;
+extern const struct sched_class fair_sched_class;
+extern const struct sched_class idle_sched_class;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+extern const struct sched_class ext_sched_class;
+
+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
+
+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
+#else /* !CONFIG_SCHED_CLASS_EXT */
+#define scx_enabled() false
+#define scx_switched_all() false
+#endif /* !CONFIG_SCHED_CLASS_EXT */
+
+/*
+ * Iterate only active classes. SCX can take over all fair tasks or be
+ * completely disabled. If the former, skip fair. If the latter, skip SCX.
+ */
+static inline const struct sched_class *next_active_class(const struct sched_class *class)
+{
+ class++;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (scx_switched_all() && class == &fair_sched_class)
+ class++;
+ if (!scx_enabled() && class == &ext_sched_class)
+ class++;
+#endif
+ return class;
+}
+
#define for_class_range(class, _from, _to) \
for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
for_class_range(class, __sched_class_highest, __sched_class_lowest)
-#define sched_class_above(_a, _b) ((_a) < (_b))
+#define for_active_class_range(class, _from, _to) \
+ for (class = (_from); class != (_to); class = next_active_class(class))
-extern const struct sched_class stop_sched_class;
-extern const struct sched_class dl_sched_class;
-extern const struct sched_class rt_sched_class;
-extern const struct sched_class fair_sched_class;
-extern const struct sched_class idle_sched_class;
+#define for_each_active_class(class) \
+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
static inline bool sched_stop_runnable(struct rq *rq)
{
@@ -2408,7 +2571,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_next_task_idle(struct rq *rq);
+extern struct task_struct *pick_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
@@ -2424,6 +2587,19 @@ extern void sched_balance_trigger(struct rq *rq);
extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
+{
+ /* When not in the task's cpumask, no point in looking further. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ /* Can @cpu run a user thread? */
+ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p))
+ return false;
+
+ return true;
+}
+
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
@@ -2457,6 +2633,11 @@ extern int push_cpu_stop(void *arg);
#else /* !CONFIG_SMP: */
+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
+{
+ return true;
+}
+
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
struct affinity_context *ctx)
{
@@ -2510,12 +2691,9 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
-
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -2586,6 +2764,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
+static inline void __block_task(struct rq *rq, struct task_struct *p)
+{
+ WRITE_ONCE(p->on_rq, 0);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
+
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+}
+
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3099,6 +3290,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
+#else /* !CONFIG_SMP */
+static inline bool update_other_load_avgs(struct rq *rq) { return false; }
#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
@@ -3607,8 +3800,10 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c
extern void __setscheduler_prio(struct task_struct *p, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+extern void check_class_changing(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class);
extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio);
@@ -3629,4 +3824,24 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea
#endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+/*
+ * Used by SCX in the enable/disable paths to move tasks between sched_classes
+ * and establish invariants.
+ */
+struct sched_enq_and_set_ctx {
+ struct task_struct *p;
+ int queue_flags;
+ bool queued;
+ bool running;
+};
+
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx);
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#include "ext.h"
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index b1b8fe61c532..058dd42e3d9b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq)
return rq->stop;
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
-{
- struct task_struct *p = pick_task_stop(rq);
-
- if (p)
- set_next_task_stop(rq, p, true);
-
- return p;
-}
-
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
}
-static void
+static bool
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ return true;
}
static void yield_task_stop(struct rq *rq)
@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
update_curr_common(rq);
}
@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = {
.wakeup_preempt = wakeup_preempt_stop,
- .pick_next_task = pick_next_task_stop,
+ .pick_task = pick_task_stop,
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.balance = balance_stop,
- .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 195d2f2834a9..aa70beee9895 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p)
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
- if (!rt_prio(p->prio))
+ if (!rt_or_dl_prio(p->prio))
return p->normal_prio;
return p->prio;
}
@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu)
#endif
-#ifdef CONFIG_SMP
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
-{
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time to the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * IRQ metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
-#endif /* CONFIG_SMP */
-
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
@@ -401,13 +300,23 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.custom_slice = 1;
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
/* rt-policy tasks do not have a timerslack */
- if (task_is_realtime(p)) {
+ if (rt_or_dl_task_policy(p)) {
p->timer_slack_ns = 0;
} else if (p->timer_slack_ns == 0) {
/* when switching back to non-rt policy, restore timerslack */
@@ -703,12 +612,18 @@ recheck:
goto unlock;
}
+ retval = scx_check_setscheduler(p, policy);
+ if (retval)
+ goto unlock;
+
/*
* If not changing anything there's no need to proceed further,
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
@@ -805,6 +720,7 @@ change:
__setscheduler_prio(p, newprio);
}
__setscheduler_uclamp(p, attr);
+ check_class_changing(rq, p, prev_class);
if (queued) {
/*
@@ -854,6 +770,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@ -1020,12 +939,14 @@ err_size:
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
/**
@@ -1610,6 +1531,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
+ case SCHED_EXT:
ret = 0;
break;
}
@@ -1637,6 +1559,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
+ case SCHED_EXT:
ret = 0;
}
return ret;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 76504b776d03..9748a4c8d668 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
+ /*
+ * Because the rq is not a task, dl_add_task_root_domain() did not
+ * move the fair server bw to the rd if it already started.
+ * Add it now.
+ */
+ if (rq->fair_server.dl_server)
+ __dl_server_attach_root(&rq->fair_server, rq);
+
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
diff --git a/kernel/signal.c b/kernel/signal.c
index 6fe29715105b..6e57036f947f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3941,11 +3941,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
return -EINVAL;
f = fdget(pidfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/* Is this a pidfd? */
- pid = pidfd_to_pid(f.file);
+ pid = pidfd_to_pid(fd_file(f));
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
@@ -3958,7 +3958,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
switch (flags) {
case 0:
/* Infer scope from the type of pidfd. */
- if (f.file->f_flags & PIDFD_THREAD)
+ if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
diff --git a/kernel/sys.c b/kernel/sys.c
index e3c4cffb520c..4da31f28fda8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1916,10 +1916,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
int err;
exe = fdget(fd);
- if (!exe.file)
+ if (!fd_file(exe))
return -EBADF;
- inode = file_inode(exe.file);
+ inode = file_inode(fd_file(exe));
/*
* Because the original mm->exe_file points to executable file, make
@@ -1927,14 +1927,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
goto exit;
- err = file_permission(exe.file, MAY_EXEC);
+ err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
goto exit;
- err = replace_mm_exe_file(mm, exe.file);
+ err = replace_mm_exe_file(mm, fd_file(exe));
exit:
fdput(exe);
return err;
@@ -2557,7 +2557,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
- if (task_is_realtime(current))
+ if (rt_or_dl_task_policy(current))
break;
if (arg2 <= 0)
current->timer_slack_ns =
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4354ea231fab..0700f40c53ac 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -419,7 +419,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return 0;
size = nla_total_size(sizeof(struct cgroupstats));
@@ -440,7 +440,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, f.file->f_path.dentry);
+ rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 12eb40d6290e..cddcd08ea827 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1977,7 +1977,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* expiry.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
mode |= HRTIMER_MODE_HARD;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ac0a01cc8634..a582cd25ca87 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
-#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -798,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct cgroup *cgrp;
-
- if (unlikely(idx >= array->map.max_entries))
- return -E2BIG;
-
- cgrp = READ_ONCE(array->ptrs[idx]);
- if (unlikely(!cgrp))
- return -EAGAIN;
-
- return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
- .func = bpf_current_task_under_cgroup,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
@@ -1226,7 +1202,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_LONG,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg3_size = sizeof(u64),
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
@@ -1242,7 +1219,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_LONG,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg2_size = sizeof(u64),
};
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
@@ -1439,73 +1417,6 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_p: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr *value_p)
-{
- struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
- struct dentry *dentry;
- u32 value_len;
- void *value;
- int ret;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
- value_len = __bpf_dynptr_size(value_ptr);
- value = __bpf_dynptr_data_rw(value_ptr, value_len);
- if (!value)
- return -EINVAL;
-
- dentry = file_dentry(file);
- ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
- if (ret)
- return ret;
- return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_KFUNCS_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
- if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
- return 0;
-
- /* Only allow to attach from LSM hooks, to avoid recursion */
- return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &fs_kfunc_set_ids,
- .filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1548,8 +1459,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_write_user:
@@ -1578,6 +1487,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1598,7 +1509,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
@@ -1654,7 +1566,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
@@ -3299,7 +3211,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
struct bpf_run_ctx *old_run_ctx;
int err = 0;
- if (link->task && current->mm != link->task->mm)
+ if (link->task && !same_thread_group(current, link->task))
return 0;
if (sleepable)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cebd879a30cb..77dc0b25140e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -32,6 +32,8 @@
#include <asm/local64.h>
#include <asm/local.h>
+#include "trace.h"
+
/*
* The "absolute" timestamp in the buffer is only 59 bits.
* If a clock has the 5 MSBs set, it needs to be saved and
@@ -42,6 +44,21 @@
static void update_pages_handler(struct work_struct *work);
+#define RING_BUFFER_META_MAGIC 0xBADFEED
+
+struct ring_buffer_meta {
+ int magic;
+ int struct_size;
+ unsigned long text_addr;
+ unsigned long data_addr;
+ unsigned long first_buffer;
+ unsigned long head_buffer;
+ unsigned long commit_buffer;
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+ int buffers[];
+};
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -342,7 +359,8 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
- u32 id; /* ID for external mapping */
+ u32 id:30; /* ID for external mapping */
+ u32 range:1; /* Mapped via a range */
struct buffer_data_page *page; /* Actual data page */
};
@@ -373,7 +391,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_pages((unsigned long)bpage->page, bpage->order);
+ /* Range pages are not to be freed */
+ if (!bpage->range)
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -491,9 +511,11 @@ struct ring_buffer_per_cpu {
unsigned long pages_removed;
unsigned int mapped;
+ unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
unsigned long *subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
+ struct ring_buffer_meta *ring_meta;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -523,6 +545,12 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+ unsigned long range_addr_start;
+ unsigned long range_addr_end;
+
+ long last_text_delta;
+ long last_data_delta;
+
unsigned int subbuf_size;
unsigned int subbuf_order;
unsigned int max_data_size;
@@ -1239,6 +1267,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
+
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->head_buffer = (unsigned long)head->page;
+ }
}
static void rb_list_head_clear(struct list_head *list)
@@ -1478,9 +1511,484 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
}
}
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+ addr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_subbufs;
+ return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return the ring_buffer_meta for a given @cpu.
+ */
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
+{
+ int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *meta;
+ int nr_subbufs;
+
+ if (!ptr)
+ return NULL;
+
+ /* When nr_pages passed in is zero, the first meta has already been initialized */
+ if (!nr_pages) {
+ meta = (struct ring_buffer_meta *)ptr;
+ nr_subbufs = meta->nr_subbufs;
+ } else {
+ meta = NULL;
+ /* Include the reader page */
+ nr_subbufs = nr_pages + 1;
+ }
+
+ /*
+ * The first chunk may not be subbuffer aligned, where as
+ * the rest of the chunks are.
+ */
+ if (cpu) {
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* We can use multiplication to find chunks greater than 1 */
+ if (cpu > 1) {
+ unsigned long size;
+ unsigned long p;
+
+ /* Save the beginning of this CPU chunk */
+ p = ptr;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* Now all chunks after this are the same size */
+ size = ptr - p;
+ ptr += size * (cpu - 2);
+ }
+ }
+ return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+ int subbuf_size = meta->subbuf_size;
+ unsigned long ptr;
+
+ ptr = (unsigned long)meta;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+ return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long ptr;
+ int subbuf_size;
+
+ meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+ if (!meta)
+ return NULL;
+
+ if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
+ return NULL;
+
+ subbuf_size = meta->subbuf_size;
+
+ /* Map this buffer to the order that's in meta->buffers[] */
+ idx = meta->buffers[idx];
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+ ptr += subbuf_size * idx;
+ if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+ return NULL;
+
+ return (void *)ptr;
+}
+
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages)
+{
+ int subbuf_size = PAGE_SIZE;
+ struct buffer_data_page *subbuf;
+ unsigned long buffers_start;
+ unsigned long buffers_end;
+ int i;
+
+ /* Check the meta magic and meta struct size */
+ if (meta->magic != RING_BUFFER_META_MAGIC ||
+ meta->struct_size != sizeof(*meta)) {
+ pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
+ return false;
+ }
+
+ /* The subbuffer's size and number of subbuffers must match */
+ if (meta->subbuf_size != subbuf_size ||
+ meta->nr_subbufs != nr_pages + 1) {
+ pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
+ return false;
+ }
+
+ buffers_start = meta->first_buffer;
+ buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+ /* Is the head and commit buffers within the range of buffers? */
+ if (meta->head_buffer < buffers_start ||
+ meta->head_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+ return false;
+ }
+
+ if (meta->commit_buffer < buffers_start ||
+ meta->commit_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+ return false;
+ }
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ /* Is the meta buffers and the subbufs themselves have correct data? */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ if (meta->buffers[i] < 0 ||
+ meta->buffers[i] >= meta->nr_subbufs) {
+ pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+ return false;
+ }
+
+ if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+ pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+ return false;
+ }
+
+ subbuf = (void *)subbuf + subbuf_size;
+ }
+
+ return true;
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+ unsigned long long *timestamp, u64 *delta_ptr)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int events = 0;
+ int e;
+
+ *delta_ptr = 0;
+ *timestamp = 0;
+
+ ts = dpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(dpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ *delta_ptr = delta;
+ *timestamp = ts;
+ return -1;
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ events++;
+ ts += event->time_delta;
+ break;
+
+ default:
+ return -1;
+ }
+ }
+ *timestamp = ts;
+ return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+ unsigned long long ts;
+ u64 delta;
+ int tail;
+
+ tail = local_read(&dpage->commit);
+ return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page;
+ unsigned long entry_bytes = 0;
+ unsigned long entries = 0;
+ int ret;
+ int i;
+
+ if (!meta || !meta->head_buffer)
+ return;
+
+ /* Do the reader page first */
+ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer reader page is invalid\n");
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+ local_set(&cpu_buffer->reader_page->entries, ret);
+
+ head_page = cpu_buffer->head_page;
+
+ /* If both the head and commit are on the reader_page then we are done. */
+ if (head_page == cpu_buffer->reader_page &&
+ head_page == cpu_buffer->commit_page)
+ goto done;
+
+ /* Iterate until finding the commit page */
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+ /* Reader page has already been done */
+ if (head_page == cpu_buffer->reader_page)
+ continue;
+
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer meta [%d] invalid buffer page\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&head_page->page->commit);
+ local_set(&cpu_buffer->head_page->entries, ret);
+
+ if (head_page == cpu_buffer->commit_page)
+ break;
+ }
+
+ if (head_page != cpu_buffer->commit_page) {
+ pr_info("Ring buffer meta [%d] commit page not found\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ done:
+ local_set(&cpu_buffer->entries, entries);
+ local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ return;
+
+ invalid:
+ /* The content of the buffers are invalid, reset the meta data */
+ meta->head_buffer = 0;
+ meta->commit_buffer = 0;
+
+ /* Reset the reader page */
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+ /* Reset all the subbuffers */
+ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ }
+}
+
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
+#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
+#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
+
+static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
+{
+ meta->text_addr = THIS_TEXT_PTR;
+ meta->data_addr = THIS_DATA_PTR;
+}
+
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long delta;
+ void *subbuf;
+ int cpu;
+ int i;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *next_meta;
+
+ meta = rb_range_meta(buffer, nr_pages, cpu);
+
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ /* Make the mappings match the current address */
+ subbuf = rb_subbufs_from_meta(meta);
+ delta = (unsigned long)subbuf - meta->first_buffer;
+ meta->first_buffer += delta;
+ meta->head_buffer += delta;
+ meta->commit_buffer += delta;
+ buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
+ buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
+ continue;
+ }
+
+ if (cpu < nr_cpu_ids - 1)
+ next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+ else
+ next_meta = (void *)buffer->range_addr_end;
+
+ memset(meta, 0, next_meta - (void *)meta);
+
+ meta->magic = RING_BUFFER_META_MAGIC;
+ meta->struct_size = sizeof(*meta);
+
+ meta->nr_subbufs = nr_pages + 1;
+ meta->subbuf_size = PAGE_SIZE;
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ meta->first_buffer = (unsigned long)subbuf;
+ rb_meta_init_text_addr(meta);
+
+ /*
+ * The buffers[] array holds the order of the sub-buffers
+ * that are after the meta data. The sub-buffers may
+ * be swapped out when read and inserted into a different
+ * location of the ring buffer. Although their addresses
+ * remain the same, the buffers[] array contains the
+ * index into the sub-buffers holding their actual order.
+ */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ meta->buffers[i] = i;
+ rb_init_page(subbuf);
+ subbuf += meta->subbuf_size;
+ }
+ }
+}
+
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val;
+
+ if (!meta)
+ return NULL;
+
+ if (*pos > meta->nr_subbufs)
+ return NULL;
+
+ val = *pos;
+ val++;
+
+ return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return rbm_start(m, pos);
+}
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val = (unsigned long)v;
+
+ if (val == 1) {
+ seq_printf(m, "head_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+ seq_printf(m, "commit_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+ seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+ seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+ return 0;
+ }
+
+ val -= 2;
+ seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+ return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+ .start = rbm_start,
+ .next = rbm_next,
+ .show = rbm_show,
+ .stop = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &rb_meta_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = buffer->buffers[cpu];
+
+ return 0;
+}
+
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+
+ if (meta->head_buffer == (unsigned long)bpage->page)
+ cpu_buffer->head_page = bpage;
+
+ if (meta->commit_buffer == (unsigned long)bpage->page) {
+ cpu_buffer->commit_page = bpage;
+ cpu_buffer->tail_page = bpage;
+ }
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
+ struct trace_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -1515,6 +2023,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
if (user_thread)
set_current_oom_origin();
+
+ if (buffer->range_addr_start)
+ meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1525,16 +2037,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_bpage(cpu_buffer, bpage);
- list_add(&bpage->list, pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
+ /*
+ * Append the pages as for mapped buffers we want to keep
+ * the order
+ */
+ list_add_tail(&bpage->list, pages);
+
+ if (meta) {
+ /* A range was given. Use that for the buffer page */
+ bpage->page = rb_range_buffer(cpu_buffer, i + 1);
+ if (!bpage->page)
+ goto free_pages;
+ /* If this is valid from a previous boot */
+ if (meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ bpage->id = i + 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
bpage->order = cpu_buffer->buffer->subbuf_order;
- rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
@@ -1584,6 +2112,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -1614,12 +2143,28 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ if (buffer->range_addr_start) {
+ /*
+ * Range mapped buffers have the same restrictions as memory
+ * mapped ones do.
+ */
+ cpu_buffer->mapped = 1;
+ cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+ bpage->page = rb_range_buffer(cpu_buffer, 0);
+ if (!bpage->page)
+ goto fail_free_reader;
+ if (cpu_buffer->ring_meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1628,11 +2173,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (ret < 0)
goto fail_free_reader;
- cpu_buffer->head_page
- = list_entry(cpu_buffer->pages, struct buffer_page, list);
- cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_meta_validate_events(cpu_buffer);
+
+ /* If the boot meta was valid then this has already been updated */
+ meta = cpu_buffer->ring_meta;
+ if (!meta || !meta->head_buffer ||
+ !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+ if (meta && meta->head_buffer &&
+ (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+ pr_warn("Ring buffer meta buffers not all mapped\n");
+ if (!cpu_buffer->head_page)
+ pr_warn(" Missing head_page\n");
+ if (!cpu_buffer->commit_page)
+ pr_warn(" Missing commit_page\n");
+ if (!cpu_buffer->tail_page)
+ pr_warn(" Missing tail_page\n");
+ }
- rb_head_page_activate(cpu_buffer);
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ rb_head_page_activate(cpu_buffer);
+
+ if (cpu_buffer->ring_meta)
+ meta->commit_buffer = meta->head_buffer;
+ } else {
+ /* The valid meta buffer still needs to activate the head page */
+ rb_head_page_activate(cpu_buffer);
+ }
return cpu_buffer;
@@ -1669,22 +2238,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
- struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long end,
+ struct lock_class_key *key)
{
struct trace_buffer *buffer;
long nr_pages;
+ int subbuf_size;
int bsize;
int cpu;
int ret;
@@ -1698,14 +2259,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- /* Default buffer page size - one system page */
- buffer->subbuf_order = 0;
- buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+ buffer->subbuf_order = order;
+ subbuf_size = (PAGE_SIZE << order);
+ buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
- nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1713,10 +2273,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
- /* need at least two pages */
- if (nr_pages < 2)
- nr_pages = 2;
-
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1725,6 +2281,56 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ /* If start/end are specified, then that overrides size */
+ if (start && end) {
+ unsigned long ptr;
+ int n;
+
+ size = end - start;
+ size = size / nr_cpu_ids;
+
+ /*
+ * The number of sub-buffers (nr_pages) is determined by the
+ * total size allocated minus the meta data size.
+ * Then that is divided by the number of per CPU buffers
+ * needed, plus account for the integer array index that
+ * will be appended to the meta data.
+ */
+ nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ (subbuf_size + sizeof(int));
+ /* Need at least two pages plus the reader page */
+ if (nr_pages < 3)
+ goto fail_free_buffers;
+
+ again:
+ /* Make sure that the size fits aligned */
+ for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_pages;
+ ptr = ALIGN(ptr, subbuf_size);
+ ptr += subbuf_size * nr_pages;
+ }
+ if (ptr > end) {
+ if (nr_pages <= 3)
+ goto fail_free_buffers;
+ nr_pages--;
+ goto again;
+ }
+
+ /* nr_pages should not count the reader page */
+ nr_pages--;
+ buffer->range_addr_start = start;
+ buffer->range_addr_end = end;
+
+ rb_range_meta_init(buffer, nr_pages);
+ } else {
+
+ /* need at least two pages */
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+ if (nr_pages < 2)
+ nr_pages = 2;
+ }
+
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
@@ -1753,9 +2359,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
kfree(buffer);
return NULL;
}
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ /* Default buffer page size - one system page */
+ return alloc_buffer(size, flags, 0, 0, 0,key);
+
+}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @order: sub-buffer order
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(size, flags, order, start, start + range_size, key);
+}
+
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: The true if the delta is non zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+ long *data)
+{
+ if (!buffer)
+ return false;
+
+ if (!buffer->last_text_delta)
+ return false;
+
+ *text = buffer->last_text_delta;
+ *data = buffer->last_data_delta;
+
+ return true;
+}
+
+/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
@@ -2364,6 +3034,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->next_event = 0;
}
+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+{
+ void *subbuf_array;
+
+ subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+ subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+ return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *next_page)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long old_head = (unsigned long)next_page->page;
+ unsigned long new_head;
+
+ rb_inc_page(&next_page);
+ new_head = (unsigned long)next_page->page;
+
+ /*
+ * Only move it forward once, if something else came in and
+ * moved it forward, then we don't want to touch it.
+ */
+ (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *reader)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ void *old_reader = cpu_buffer->reader_page->page;
+ void *new_reader = reader->page;
+ int id;
+
+ id = reader->id;
+ cpu_buffer->reader_page->id = id;
+ reader->id = 0;
+
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+ meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+ /* The head pointer is the one after the reader */
+ rb_update_meta_head(cpu_buffer, reader);
+}
+
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2413,6 +3129,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_head(cpu_buffer, next_page);
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -2974,6 +3692,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+ }
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3420,11 +4142,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
- int e;
+ int ret;
bpage = info->tail_page->page;
@@ -3450,39 +4171,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ts = bpage->time_stamp;
-
- for (e = 0; e < tail; e += rb_event_length(event)) {
-
- event = (struct ring_buffer_event *)(bpage->data + e);
-
- switch (event->type_len) {
-
- case RINGBUF_TYPE_TIME_EXTEND:
- delta = rb_event_time_stamp(event);
- ts += delta;
- break;
-
- case RINGBUF_TYPE_TIME_STAMP:
- delta = rb_event_time_stamp(event);
- delta = rb_fix_abs_ts(delta, ts);
- if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
- }
- ts = delta;
- break;
-
- case RINGBUF_TYPE_PADDING:
- if (event->time_delta == 1)
- break;
- fallthrough;
- case RINGBUF_TYPE_DATA:
- ts += event->time_delta;
- break;
-
- default:
- RB_WARN_ON(cpu_buffer, 1);
+ ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ if (ret < 0) {
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ goto out;
}
}
if ((full && ts > info->ts) ||
@@ -4591,6 +5285,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_reader(cpu_buffer, reader);
+
/*
* Yay! We succeeded in replacing the page.
*
@@ -5212,6 +5909,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ if (!meta)
+ return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -5268,11 +5968,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
- if (cpu_buffer->mapped)
- rb_update_meta_page(cpu_buffer);
-
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
+
+ if (cpu_buffer->mapped) {
+ rb_update_meta_page(cpu_buffer);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = meta->head_buffer;
+ }
+ }
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5303,6 +6008,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -5321,6 +6027,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -5335,6 +6046,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -5362,6 +6074,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -6135,10 +6852,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
- meta->meta_page_size = PAGE_SIZE;
meta->meta_struct_len = sizeof(*meta);
meta->nr_subbufs = nr_subbufs;
meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ meta->meta_page_size = meta->subbuf_size;
rb_update_meta_page(cpu_buffer);
}
@@ -6155,7 +6872,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6179,19 +6896,26 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
+ /* mapped is always greater or equal to user_mapped */
+ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ return -EINVAL;
+
if (inc && cpu_buffer->mapped == UINT_MAX)
return -EBUSY;
- if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+ if (WARN_ON(!inc && cpu_buffer->user_mapped == 0))
return -EINVAL;
mutex_lock(&cpu_buffer->buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- if (inc)
+ if (inc) {
+ cpu_buffer->user_mapped++;
cpu_buffer->mapped++;
- else
+ } else {
+ cpu_buffer->user_mapped--;
cpu_buffer->mapped--;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
mutex_unlock(&cpu_buffer->buffer->mutex);
@@ -6214,7 +6938,7 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
struct vm_area_struct *vma)
{
- unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+ unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
struct page **pages;
int p = 0, s = 0;
@@ -6225,6 +6949,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
!(vma->vm_flags & VM_MAYSHARE))
return -EPERM;
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ if (subbuf_order && pgoff % subbuf_pages)
+ return -EINVAL;
+
/*
* Make sure the mapping cannot become writable later. Also tell the VM
* to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
@@ -6234,37 +6964,38 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
- subbuf_order = cpu_buffer->buffer->subbuf_order;
- subbuf_pages = 1 << subbuf_order;
-
nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
- nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */
- vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
- if (!vma_pages || vma_pages > nr_pages)
+ nr_vma_pages = vma_pages(vma);
+ if (!nr_vma_pages || nr_vma_pages > nr_pages)
return -EINVAL;
- nr_pages = vma_pages;
+ nr_pages = nr_vma_pages;
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
if (!pgoff) {
+ unsigned long meta_page_padding;
+
pages[p++] = virt_to_page(cpu_buffer->meta_page);
/*
- * TODO: Align sub-buffers on their size, once
- * vm_insert_pages() supports the zero-page.
+ * Pad with the zero-page to align the meta-page with the
+ * sub-buffers.
*/
- } else {
- /* Skip the meta-page */
- pgoff--;
+ meta_page_padding = subbuf_pages - 1;
+ while (meta_page_padding-- && p < nr_pages) {
+ unsigned long __maybe_unused zero_addr =
+ vma->vm_start + (PAGE_SIZE * p);
- if (pgoff % subbuf_pages) {
- err = -EINVAL;
- goto out;
+ pages[p++] = ZERO_PAGE(zero_addr);
}
+ } else {
+ /* Skip the meta-page */
+ pgoff -= subbuf_pages;
s += pgoff / subbuf_pages;
}
@@ -6316,7 +7047,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
mutex_lock(&cpu_buffer->mapping_lock);
- if (cpu_buffer->mapped) {
+ if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
@@ -6347,12 +7078,15 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
*/
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
err = __rb_map_vma(cpu_buffer, vma);
if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 1;
+ /* This is the first time it is mapped by user */
+ cpu_buffer->mapped++;
+ cpu_buffer->user_mapped = 1;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6380,10 +7114,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
err = -ENODEV;
goto out;
- } else if (cpu_buffer->mapped > 1) {
+ } else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
goto out;
}
@@ -6391,7 +7125,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 0;
+ /* This is the last user space mapping */
+ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ cpu_buffer->mapped--;
+ cpu_buffer->user_mapped = 0;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c3b2c7dfadef..b4f348b4653f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -500,6 +500,29 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+static struct trace_array *printk_trace = &global_trace;
+
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+static void update_printk_trace(struct trace_array *tr)
+{
+ if (printk_trace == tr)
+ return;
+
+ printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace = tr;
+ tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+}
+
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -1117,7 +1140,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
*/
int __trace_puts(unsigned long ip, const char *str, int size)
{
- return __trace_array_puts(&global_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1128,6 +1151,7 @@ EXPORT_SYMBOL_GPL(__trace_puts);
*/
int __trace_bputs(unsigned long ip, const char *str)
{
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
@@ -1135,14 +1159,17 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int ret = 0;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str, strlen(str));
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -1155,7 +1182,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -3021,7 +3048,7 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.array_buffer.buffer,
+ __ftrace_trace_stack(printk_trace->array_buffer.buffer,
tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3240,12 +3267,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- struct trace_array *tr = &global_trace;
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct bprint_entry *entry;
unsigned int trace_ctx;
char *tbuffer;
int len = 0, size;
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3338,7 +3368,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3434,7 +3464,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -3446,7 +3476,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- return trace_array_vprintk(&global_trace, ip, fmt, args);
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -3667,8 +3697,11 @@ static void test_can_verify(void)
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
va_list ap)
{
+ long text_delta = iter->tr->text_delta;
+ long data_delta = iter->tr->data_delta;
const char *p = fmt;
const char *str;
+ bool good;
int i, j;
if (WARN_ON_ONCE(!fmt))
@@ -3687,7 +3720,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
j = 0;
- /* We only care about %s and variants */
+ /*
+ * We only care about %s and variants
+ * as well as %p[sS] if delta is non-zero
+ */
for (i = 0; p[i]; i++) {
if (i + 1 >= iter->fmt_size) {
/*
@@ -3716,6 +3752,11 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
}
if (p[i+j] == 's')
break;
+
+ if (text_delta && p[i+1] == 'p' &&
+ ((p[i+2] == 's' || p[i+2] == 'S')))
+ break;
+
star = false;
}
j = 0;
@@ -3729,6 +3770,24 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /* Add delta to %pS pointers */
+ if (p[i+1] == 'p') {
+ unsigned long addr;
+ char fmt[4];
+
+ fmt[0] = '%';
+ fmt[1] = 'p';
+ fmt[2] = p[i+2]; /* Either %ps or %pS */
+ fmt[3] = '\0';
+
+ addr = va_arg(ap, unsigned long);
+ addr += text_delta;
+ trace_seq_printf(&iter->seq, fmt, (void *)addr);
+
+ p += i + 3;
+ continue;
+ }
+
/*
* If iter->seq is full, the above call no longer guarantees
* that ap is in sync with fmt processing, and further calls
@@ -3747,6 +3806,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
+ good = trace_safe_str(iter, str, star, len);
+
+ /* Could be from the last boot */
+ if (data_delta && !good) {
+ str += data_delta;
+ good = trace_safe_str(iter, str, star, len);
+ }
+
/*
* If you hit this warning, it is likely that the
* trace event in question used %s on a string that
@@ -3756,8 +3823,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
+ if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
@@ -4919,6 +4985,11 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
+#ifdef CONFIG_TRACER_SNAPSHOT
+ /* arrays with mapped buffer range do not have snapshots */
+ if (tr->range_addr_start && t->use_max_tr)
+ return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5011,7 +5082,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5052,7 +5123,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = show_traces_release,
+ .release = tracing_seq_release,
};
static ssize_t
@@ -5237,7 +5308,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD))
+ (mask == TRACE_ITER_RECORD_CMD) ||
+ (mask == TRACE_ITER_TRACE_PRINTK))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5249,6 +5321,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
+ if (mask == TRACE_ITER_TRACE_PRINTK) {
+ if (enabled) {
+ update_printk_trace(tr);
+ } else {
+ /*
+ * The global_trace cannot clear this.
+ * It's flag only gets cleared if another instance sets it.
+ */
+ if (printk_trace == &global_trace)
+ return -EINVAL;
+ /*
+ * An instance must always have it set.
+ * by default, that's the global_trace instane.
+ */
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+ }
+ }
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -6034,6 +6125,18 @@ out:
return ret;
}
+static void update_last_data(struct trace_array *tr)
+{
+ if (!tr->text_delta && !tr->data_delta)
+ return;
+
+ /* Clear old data */
+ tracing_reset_online_cpus(&tr->array_buffer);
+
+ /* Using current data now */
+ tr->text_delta = 0;
+ tr->data_delta = 0;
+}
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6051,6 +6154,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6106,6 +6212,8 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6854,6 +6962,37 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
}
static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct seq_buf seq;
+ char buf[64];
+
+ seq_buf_init(&seq, buf, 64);
+
+ seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+ seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+}
+
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+ if (ret < 0)
+ __trace_array_put(tr);
+ return ret;
+}
+
+static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7429,6 +7568,13 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_buffer_meta_fops = {
+ .open = tracing_buffer_meta_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
static const struct file_operations tracing_total_entries_fops = {
.open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
@@ -7469,6 +7615,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release = tracing_single_release_tr,
};
+static const struct file_operations last_boot_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_last_boot_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -8661,12 +8814,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
+ if (tr->range_addr_start)
+ trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &tracing_buffer_meta_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
- tr, cpu, &snapshot_fops);
+ if (!tr->range_addr_start) {
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
- tr, cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+ }
#endif
}
@@ -9203,7 +9361,21 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
buf->tr = tr;
- buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (tr->range_addr_start && tr->range_addr_size) {
+ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+
+ ring_buffer_last_boot_delta(buf->buffer,
+ &tr->text_delta, &tr->data_delta);
+ /*
+ * This is basically the same as a mapped buffer,
+ * with the same restrictions.
+ */
+ tr->mapped++;
+ } else {
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ }
if (!buf->buffer)
return -ENOMEM;
@@ -9240,6 +9412,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Fix mapped buffer trace arrays do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9340,7 +9516,9 @@ static int trace_array_create_dir(struct trace_array *tr)
}
static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+ unsigned long range_addr_start,
+ unsigned long range_addr_size)
{
struct trace_array *tr;
int ret;
@@ -9366,6 +9544,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
+ /* Only for boot up memory mapped ring buffers */
+ tr->range_addr_start = range_addr_start;
+ tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9423,7 +9605,7 @@ trace_array_create_systems(const char *name, const char *systems)
static struct trace_array *trace_array_create(const char *name)
{
- return trace_array_create_systems(name, NULL);
+ return trace_array_create_systems(name, NULL, 0, 0);
}
static int instance_mkdir(const char *name)
@@ -9448,6 +9630,31 @@ out_unlock:
return ret;
}
+static u64 map_pages(u64 start, u64 size)
+{
+ struct page **pages;
+ phys_addr_t page_start;
+ unsigned int page_count;
+ unsigned int i;
+ void *vaddr;
+
+ page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ page_start = start;
+ pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ for (i = 0; i < page_count; i++) {
+ phys_addr_t addr = page_start + i * PAGE_SIZE;
+ pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ }
+ vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+
+ return (u64)(unsigned long)vaddr;
+}
+
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
@@ -9477,7 +9684,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
goto out_unlock;
}
- tr = trace_array_create_systems(name, systems);
+ tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
@@ -9507,6 +9714,9 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1 << i, 0);
}
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9669,10 +9879,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
+ if (tr->range_addr_start) {
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+ } else {
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+ tr, &snapshot_fops);
#endif
+ }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
@@ -10292,6 +10507,7 @@ __init static void enable_instances(void)
{
struct trace_array *tr;
char *curr_str;
+ char *name;
char *str;
char *tok;
@@ -10300,19 +10516,107 @@ __init static void enable_instances(void)
str = boot_instance_info;
while ((curr_str = strsep(&str, "\t"))) {
+ phys_addr_t start = 0;
+ phys_addr_t size = 0;
+ unsigned long addr = 0;
+ bool traceprintk = false;
+ bool traceoff = false;
+ char *flag_delim;
+ char *addr_delim;
tok = strsep(&curr_str, ",");
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(tok);
+ flag_delim = strchr(tok, '^');
+ addr_delim = strchr(tok, '@');
- tr = trace_array_get_by_name(tok, NULL);
- if (!tr) {
- pr_warn("Failed to create instance buffer %s\n", curr_str);
+ if (addr_delim)
+ *addr_delim++ = '\0';
+
+ if (flag_delim)
+ *flag_delim++ = '\0';
+
+ name = tok;
+
+ if (flag_delim) {
+ char *flag;
+
+ while ((flag = strsep(&flag_delim, "^"))) {
+ if (strcmp(flag, "traceoff") == 0) {
+ traceoff = true;
+ } else if ((strcmp(flag, "printk") == 0) ||
+ (strcmp(flag, "traceprintk") == 0) ||
+ (strcmp(flag, "trace_printk") == 0)) {
+ traceprintk = true;
+ } else {
+ pr_info("Tracing: Invalid instance flag '%s' for %s\n",
+ flag, name);
+ }
+ }
+ }
+
+ tok = addr_delim;
+ if (tok && isdigit(*tok)) {
+ start = memparse(tok, &tok);
+ if (!start) {
+ pr_warn("Tracing: Invalid boot instance address for %s\n",
+ name);
+ continue;
+ }
+ if (*tok != ':') {
+ pr_warn("Tracing: No size specified for instance %s\n", name);
+ continue;
+ }
+ tok++;
+ size = memparse(tok, &tok);
+ if (!size) {
+ pr_warn("Tracing: Invalid boot instance size for %s\n",
+ name);
+ continue;
+ }
+ } else if (tok) {
+ if (!reserve_mem_find_by_name(tok, &start, &size)) {
+ start = 0;
+ pr_warn("Failed to map boot instance %s to %s\n", name, tok);
+ continue;
+ }
+ }
+
+ if (start) {
+ addr = map_pages(start, size);
+ if (addr) {
+ pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
+ name, &start, (unsigned long)size);
+ } else {
+ pr_warn("Tracing: Failed to map boot instance %s\n", name);
+ continue;
+ }
+ } else {
+ /* Only non mapped buffers have snapshot buffers */
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(name);
+ }
+
+ tr = trace_array_create_systems(name, NULL, addr, size);
+ if (IS_ERR(tr)) {
+ pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str);
continue;
}
- /* Allow user space to delete it */
- trace_array_put(tr);
+
+ if (traceoff)
+ tracer_tracing_off(tr);
+
+ if (traceprintk)
+ update_printk_trace(tr);
+
+ /*
+ * If start is set, then this is a mapped buffer, and
+ * cannot be deleted by user space, so keep the reference
+ * to it.
+ */
+ if (start)
+ tr->flags |= TRACE_ARRAY_FL_BOOT;
+ else
+ trace_array_put(tr);
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd3e3069300e..c866991b9c78 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,7 +336,6 @@ struct trace_array {
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
- unsigned int mapped;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -344,6 +343,13 @@ struct trace_array {
struct irq_work fsnotify_irqwork;
#endif
#endif
+ /* The below is for memory mapped ring buffer */
+ unsigned int mapped;
+ unsigned long range_addr_start;
+ unsigned long range_addr_size;
+ long text_delta;
+ long data_delta;
+
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
/*
@@ -423,7 +429,8 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
};
extern struct list_head ftrace_trace_arrays;
@@ -644,6 +651,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
unsigned long len,
unsigned int trace_ctx);
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -1312,6 +1321,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(TRACE_PRINTK, "trace_printk_dest"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 13d0387ac6a6..a569daaac4c4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -544,6 +544,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
+ addr += iter->tr->text_delta;
+
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return;
@@ -710,6 +712,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ unsigned long func;
int cpu = iter->cpu;
int i;
@@ -717,6 +720,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
+ func = call->func + iter->tr->text_delta;
+
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -747,10 +752,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
* enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
+ print_graph_retval(s, graph_ret->retval, true, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
else
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ trace_seq_printf(s, "%ps();\n", (void *)func);
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -766,6 +771,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
int i;
if (data) {
@@ -788,7 +794,9 @@ print_graph_entry_nested(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+ func = call->func + iter->tr->text_delta;
+
+ trace_seq_printf(s, "%ps() {\n", (void *)func);
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -863,6 +871,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
int *depth_irq;
struct fgraph_data *data = iter->private;
+ addr += iter->tr->text_delta;
+
/*
* If we are either displaying irqs, or we got called as
* a graph event and private data does not exist,
@@ -990,11 +1000,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
int i;
+ func = trace->func + iter->tr->text_delta;
+
if (check_irq_return(iter, flags, trace->depth))
return TRACE_TYPE_HANDLED;
@@ -1033,7 +1046,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* function-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)trace->func,
+ print_graph_retval(s, trace->retval, false, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
} else {
/*
@@ -1046,7 +1059,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
trace_seq_puts(s, "}\n");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ trace_seq_printf(s, "} /* %ps */\n", (void *)func);
}
/* Overrun */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..868f2f912f28 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, int flags)
+ unsigned long parent_ip, long delta, int flags)
{
+ ip += delta;
+ parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1230,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+ long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1242,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
- seq_print_ip_sym(s, *p, flags);
+ seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
@@ -1587,10 +1591,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long ip;
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
+ ip = field->ip + iter->tr->text_delta;
+
+ seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
@@ -1674,7 +1681,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 130ca7e7787e..ae2ace5e515a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
- (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..785733245ead 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
@@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
@@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0b0b95418b16..aa0b2e47f2f2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -853,9 +853,8 @@ static int sort_idmaps(struct uid_gid_map *map)
cmp_extents_forward, NULL);
/* Only copy the memory from forward we actually need. */
- map->reverse = kmemdup(map->forward,
- map->nr_extents * sizeof(struct uid_gid_extent),
- GFP_KERNEL);
+ map->reverse = kmemdup_array(map->forward, map->nr_extents,
+ sizeof(struct uid_gid_extent), GFP_KERNEL);
if (!map->reverse)
return -ENOMEM;
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 8b4f8cc2e0ec..1fec61603ef3 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -198,17 +198,17 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
VMCOREINFO_NUMBER(PG_swapbacked);
-#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab)
+#define PAGE_SLAB_MAPCOUNT_VALUE (PGTY_slab << 24)
VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE);
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
VMCOREINFO_NUMBER(PG_head_mask);
-#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+#define PAGE_BUDDY_MAPCOUNT_VALUE (PGTY_buddy << 24)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb)
+#define PAGE_HUGETLB_MAPCOUNT_VALUE (PGTY_hugetlb << 24)
VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
-#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#ifdef CONFIG_KALLSYMS
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 03b90d7d2175..d36242fd4936 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -666,8 +666,8 @@ struct watch_queue *get_watch_queue(int fd)
struct fd f;
f = fdget(fd);
- if (f.file) {
- pipe = get_pipe_info(f.file, false);
+ if (fd_file(f)) {
+ pipe = get_pipe_info(fd_file(f), false);
if (pipe && pipe->watch_queue) {
wqueue = pipe->watch_queue;
kref_get(&wqueue->usage);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 830a83895493..262691ba62b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -1203,7 +1203,10 @@ static void __init lockup_detector_delay_init(struct work_struct *work)
ret = watchdog_hardlockup_probe();
if (ret) {
- pr_info("Delayed init of the lockup detector failed: %d\n", ret);
+ if (ret == -ENODEV)
+ pr_info("NMI not fully supported\n");
+ else
+ pr_info("Delayed init of the lockup detector failed: %d\n", ret);
pr_info("Hard watchdog permanently disabled\n");
return;
}