aboutsummaryrefslogtreecommitdiff
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c717
1 files changed, 506 insertions, 211 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5aaa806702d..426c2ffba16d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,10 @@
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -355,6 +359,8 @@ enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_TIME = 0x4,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x8,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -375,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -382,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
+static cpumask_var_t perf_online_mask;
/*
* perf event paranoia level:
@@ -453,7 +461,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -678,6 +686,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
info->timestamp = ctx->timestamp;
}
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
@@ -690,61 +700,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ struct list_head *list;
unsigned long flags;
/*
- * disable interrupts to avoid geting nr_cgroup
- * changes via __perf_event_disable(). Also
- * avoids preemption.
+ * Disable interrupts and preemption to avoid this CPU's
+ * cgrp_cpuctx_entry to change under us.
*/
local_irq_save(flags);
- /*
- * we reschedule only in the presence of cgroup
- * constrained events.
- */
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- continue; /* ensure we process each cpuctx once */
+ list = this_cpu_ptr(&cgrp_cpuctx_list);
+ list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- /*
- * perf_cgroup_events says at least one
- * context on this CPU has cgroup events.
- *
- * ctx->nr_cgroups reports the number of cgroup
- * events for a context.
- */
- if (cpuctx->ctx.nr_cgroups > 0) {
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
+ * we pass the cpuctx->ctx to perf_cgroup_from_task()
+ * because cgorup events are only per-cpu
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task,
+ &cpuctx->ctx);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
local_irq_restore(flags);
@@ -889,6 +884,7 @@ list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
struct perf_cpu_context *cpuctx;
+ struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -902,15 +898,16 @@ list_update_cgroup_event(struct perf_event *event,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
-
- /*
- * cpuctx->cgrp is NULL until a cgroup event is sched in or
- * ctx->nr_cgroup == 0 .
- */
- if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
- cpuctx->cgrp = event->cgrp;
- else if (!add)
+ cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+ /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+ if (add) {
+ list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+ cpuctx->cgrp = event->cgrp;
+ } else {
+ list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
+ }
}
#else /* !CONFIG_CGROUP_PERF */
@@ -929,11 +926,6 @@ static inline int is_cgroup_event(struct perf_event *event)
return 0;
}
-static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
-{
- return 0;
-}
-
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
@@ -1005,7 +997,7 @@ list_update_cgroup_event(struct perf_event *event,
*/
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
- * function must be called with interrupts disbled
+ * function must be called with interrupts disabled
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
@@ -1453,6 +1445,27 @@ static void update_group_times(struct perf_event *leader)
update_event_times(event);
}
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ enum event_type_t event_type;
+
+ lockdep_assert_held(&ctx->lock);
+
+ /*
+ * It's 'group type', really, because if our group leader is
+ * pinned, so are we.
+ */
+ if (event->group_leader != event)
+ event = event->group_leader;
+
+ event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+ if (!ctx->task)
+ event_type |= EVENT_CPU;
+
+ return event_type;
+}
+
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2226,7 +2239,8 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task);
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
if (!cpuctx->task_ctx)
return;
@@ -2234,7 +2248,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ ctx_sched_out(ctx, cpuctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2249,13 +2263,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
+/*
+ * We want to maintain the following priority of scheduling:
+ * - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ * - task pinned (EVENT_PINNED)
+ * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ * - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
- struct perf_event_context *task_ctx)
+ struct perf_event_context *task_ctx,
+ enum event_type_t event_type)
{
+ enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+ bool cpu_event = !!(event_type & EVENT_CPU);
+
+ /*
+ * If pinned groups are involved, flexible groups also need to be
+ * scheduled out.
+ */
+ if (event_type & EVENT_PINNED)
+ event_type |= EVENT_FLEXIBLE;
+
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx);
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+ /*
+ * Decide which cpu ctx groups to schedule out based on the types
+ * of events that caused rescheduling:
+ * - EVENT_CPU: schedule out corresponding groups;
+ * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+ * - otherwise, do nothing more.
+ */
+ if (cpu_event)
+ cpu_ctx_sched_out(cpuctx, ctx_event_type);
+ else if (ctx_event_type & EVENT_PINNED)
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
}
@@ -2302,7 +2354,7 @@ static int __perf_install_in_context(void *info)
if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2469,7 +2521,7 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
}
/*
@@ -2896,7 +2948,7 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(cpuctx, ctx);
+ task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2943,7 +2995,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
return;
list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
continue;
@@ -3133,8 +3185,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
* cpu flexible, task flexible.
+ *
+ * However, if task's ctx is not carrying any pinned
+ * events, no need to flip the cpuctx's events around.
*/
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!list_empty(&ctx->pinned_groups))
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
@@ -3449,6 +3505,7 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
@@ -3462,15 +3519,19 @@ static void perf_event_enable_on_exec(int ctxn)
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- list_for_each_entry(event, &ctx->event_list, event_entry)
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
+ event_type |= get_event_type(event);
+ }
/*
* Unclone and reschedule this context if we enabled any event.
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx);
+ ctx_resched(cpuctx, ctx, event_type);
+ } else {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -3487,14 +3548,15 @@ struct perf_read_data {
int ret;
};
-static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
- int event_cpu = event->oncpu;
u16 local_pkg, event_pkg;
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- event_pkg = topology_physical_package_id(event_cpu);
- local_pkg = topology_physical_package_id(local_cpu);
+ int local_cpu = smp_processor_id();
+
+ event_pkg = topology_physical_package_id(event_cpu);
+ local_pkg = topology_physical_package_id(local_cpu);
if (event_pkg == local_pkg)
return local_cpu;
@@ -3577,10 +3639,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3588,25 +3650,37 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/*
* It must not have a pmu::count method, those are not
* NMI safe.
*/
- WARN_ON_ONCE(event->pmu->count);
+ if (event->pmu->count) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3616,15 +3690,16 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0, cpu_to_read, local_cpu;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
@@ -3637,21 +3712,25 @@ static int perf_event_read(struct perf_event *event, bool group)
.ret = 0,
};
- local_cpu = get_cpu();
- cpu_to_read = find_cpu_to_read(event, local_cpu);
- put_cpu();
+ event_cpu = READ_ONCE(event->oncpu);
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return 0;
+
+ preempt_disable();
+ event_cpu = __perf_event_read_cpu(event, event_cpu);
/*
* Purposely ignore the smp_call_function_single() return
* value.
*
- * If event->oncpu isn't a valid CPU it means the event got
+ * If event_cpu isn't a valid CPU it means the event got
* scheduled out and that will have updated the event count.
*
* Therefore, either way, we'll have an up-to-date event count
* after this.
*/
- (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
+ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
+ preempt_enable();
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -3749,14 +3828,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- /*
- * We could be clever and allow to attach a event to an
- * offline CPU and activate it when the CPU comes up, but
- * that's for later.
- */
- if (!cpu_online(cpu))
- return ERR_PTR(-ENODEV);
-
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
@@ -3931,6 +4002,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -4196,7 +4269,7 @@ int perf_event_release_kernel(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
/*
- * Mark this even as STATE_DEAD, there is no external reference to it
+ * Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
*
* Anybody acquiring event->child_mutex after the below loop _must_
@@ -4312,7 +4385,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
+ struct perf_event_context *ctx = leader->ctx;
struct perf_event *sub;
+ unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4342,12 +4417,15 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
}
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return 0;
}
@@ -4869,9 +4947,9 @@ unlock:
rcu_read_unlock();
}
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int perf_mmap_fault(struct vm_fault *vmf)
{
- struct perf_event *event = vma->vm_file->private_data;
+ struct perf_event *event = vmf->vma->vm_file->private_data;
struct ring_buffer *rb;
int ret = VM_FAULT_SIGBUS;
@@ -4894,7 +4972,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto unlock;
get_page(vmf->page);
- vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->mapping = vmf->vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0;
@@ -5664,9 +5742,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -5711,6 +5786,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that its both hard and excessively expensive to iterate the
+ * child list, not to mention that its impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ */
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
@@ -6431,6 +6513,7 @@ static void perf_event_task(struct task_struct *task,
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
}
/*
@@ -6533,6 +6616,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[NR_NAMESPACES];
+ } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ namespaces_event->event_id.header.size);
+ if (ret)
+ return;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ void *error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ .nr_namespaces = NR_NAMESPACES,
+ /* .link_info[NR_NAMESPACES] */
+ },
+ };
+
+ ns_link_info = namespaces_event.event_id.link_info;
+
+ perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
* mmap tracking
*/
@@ -7511,7 +7720,8 @@ static int swevent_hlist_get_cpu(int cpu)
int err = 0;
mutex_lock(&swhash->hlist_mutex);
- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(swhash) &&
+ cpumask_test_cpu(cpu, perf_online_mask)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -7532,7 +7742,7 @@ static int swevent_hlist_get(void)
{
int err, cpu, failed_cpu;
- get_online_cpus();
+ mutex_lock(&pmus_lock);
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(cpu);
if (err) {
@@ -7540,8 +7750,7 @@ static int swevent_hlist_get(void)
goto fail;
}
}
- put_online_cpus();
-
+ mutex_unlock(&pmus_lock);
return 0;
fail:
for_each_possible_cpu(cpu) {
@@ -7549,8 +7758,7 @@ fail:
break;
swevent_hlist_put_cpu(cpu);
}
-
- put_online_cpus();
+ mutex_unlock(&pmus_lock);
return err;
}
@@ -7845,12 +8053,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
@@ -8039,6 +8243,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
+ if (!ifh->nr_file_filters)
+ return;
+
mm = get_task_mm(event->ctx->task);
if (!mm)
goto restart;
@@ -8209,6 +8416,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
* attribute.
*/
if (state == IF_STATE_END) {
+ ret = -EINVAL;
if (kernel && event->attr.exclude_kernel)
goto fail;
@@ -8216,6 +8424,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (!filename)
goto fail;
+ /*
+ * For now, we only support file-based filters
+ * in per-task events; doing so for CPU-wide
+ * events requires additional context switching
+ * trickery, since same object code will be
+ * mapped at different virtual addresses in
+ * different processes.
+ */
+ ret = -EOPNOTSUPP;
+ if (!event->ctx->task)
+ goto fail_free_name;
+
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
@@ -8231,6 +8451,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
!S_ISREG(filter->inode->i_mode))
/* free_filters_list() will iput() */
goto fail;
+
+ event->addr_filters.nr_file_filters++;
}
/* ready to consume more filters */
@@ -8270,24 +8492,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
if (WARN_ON_ONCE(event->parent))
return -EINVAL;
- /*
- * For now, we only support filtering in per-task events; doing so
- * for CPU-wide events requires additional context switching trickery,
- * since same object code will be mapped at different virtual
- * addresses in different processes.
- */
- if (!event->ctx->task)
- return -EOPNOTSUPP;
-
ret = perf_event_parse_addr_filter(event, filter_str, &filters);
if (ret)
- return ret;
+ goto fail_clear_files;
ret = event->pmu->addr_filters_validate(&filters);
- if (ret) {
- free_filters_list(&filters);
- return ret;
- }
+ if (ret)
+ goto fail_free_filters;
/* remove existing filters, if any */
perf_addr_filters_splice(event, &filters);
@@ -8296,6 +8507,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
perf_event_for_each_child(event, perf_event_addr_filters_apply);
return ret;
+
+fail_free_filters:
+ free_filters_list(&filters);
+
+fail_clear_files:
+ event->addr_filters.nr_file_filters = 0;
+
+ return ret;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -8647,37 +8866,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
return NULL;
}
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- struct pmu *i;
-
mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
free_percpu(pmu->pmu_cpu_context);
-out:
mutex_unlock(&pmus_lock);
}
@@ -8740,7 +8932,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8749,7 +8941,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpu_function_call(cpu,
(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
return count;
@@ -8879,10 +9071,9 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -8994,12 +9185,20 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
+ struct pmu *pmu;
int idx;
int ret;
idx = srcu_read_lock(&pmus_srcu);
+ /* Try parent's PMU first: */
+ if (event->parent && event->parent->pmu) {
+ pmu = event->parent->pmu;
+ ret = perf_try_init_event(pmu, event);
+ if (!ret)
+ goto unlock;
+ }
+
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
@@ -9092,6 +9291,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -9253,9 +9454,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_FORMAT_GROUP on inherited events
+ * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * See perf_output_read().
*/
- if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
goto err_ns;
if (!has_branch_stack(event))
@@ -9268,9 +9470,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
+ if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
@@ -9283,8 +9483,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
sizeof(unsigned long),
GFP_KERNEL);
- if (!event->addr_filters_offs)
+ if (!event->addr_filters_offs) {
+ err = -ENOMEM;
goto err_per_task;
+ }
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
@@ -9637,6 +9839,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}
+ if (attr.namespaces) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
@@ -9689,12 +9896,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- get_online_cpus();
-
if (task) {
err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
if (err)
- goto err_cpus;
+ goto err_task;
/*
* Reuse ptrace permission checks for now.
@@ -9880,6 +10085,23 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_locked;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
+ }
+
+
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
* because we need to serialize with concurrent event creation.
@@ -9905,6 +10127,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of swizzling perf_event::ctx.
*/
perf_remove_from_context(group_leader, 0);
+ put_ctx(gctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
@@ -9943,13 +10166,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
-
- /*
- * Now that all events are installed in @ctx, nothing
- * references @gctx anymore, so drop the last reference we have
- * on it.
- */
- put_ctx(gctx);
}
/*
@@ -9975,8 +10191,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_task_struct(task);
}
- put_online_cpus();
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -10010,8 +10224,6 @@ err_alloc:
err_cred:
if (task)
mutex_unlock(&task->signal->cred_guard_mutex);
-err_cpus:
- put_online_cpus();
err_task:
if (task)
put_task_struct(task);
@@ -10066,6 +10278,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_unlock;
+ }
+ }
+
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
goto err_unlock;
@@ -10260,7 +10487,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -10369,21 +10596,22 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
-again:
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
- group_entry)
- perf_free_event(event, ctx);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
perf_free_event(event, ctx);
- if (!list_empty(&ctx->pinned_groups) ||
- !list_empty(&ctx->flexible_groups))
- goto again;
-
mutex_unlock(&ctx->mutex);
-
put_ctx(ctx);
}
}
@@ -10421,7 +10649,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
}
/*
- * inherit a event from parent task to child task:
+ * Inherit a event from parent task to child task.
+ *
+ * Returns:
+ * - valid pointer on success
+ * - NULL for orphaned events
+ * - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -10515,6 +10748,16 @@ inherit_event(struct perf_event *parent_event,
return child_event;
}
+/*
+ * Inherits an event group.
+ *
+ * This will quietly suppress orphaned events; !inherit_event() is not an error.
+ * This matches with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10529,6 +10772,11 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
+ /*
+ * @leader can be NULL here because of is_orphaned_event(). In this
+ * case inherit_event() will create individual events, similar to what
+ * perf_group_detach() would do anyway.
+ */
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
@@ -10538,6 +10786,17 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}
+/*
+ * Creates the child task context and tries to inherit the event-group.
+ *
+ * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
+ * inherited_all set when we 'fail' to inherit an orphaned event; this is
+ * consistent with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10560,7 +10819,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
-
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -10622,7 +10880,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -10638,7 +10896,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10666,6 +10924,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
@@ -10701,6 +10960,8 @@ static void __init perf_event_init_all_cpus(void)
struct swevent_htable *swhash;
int cpu;
+ zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
@@ -10709,11 +10970,14 @@ static void __init perf_event_init_all_cpus(void)
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+#ifdef CONFIG_CGROUP_PERF
+ INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
-int perf_event_init_cpu(unsigned int cpu)
+void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10726,7 +10990,6 @@ int perf_event_init_cpu(unsigned int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
- return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10744,19 +11007,22 @@ static void __perf_event_exit_context(void *__info)
static void perf_event_exit_cpu_context(int cpu)
{
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
struct pmu *pmu;
- int idx;
- idx = srcu_read_lock(&pmus_srcu);
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ mutex_lock(&pmus_lock);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
}
- srcu_read_unlock(&pmus_srcu, idx);
+ cpumask_clear_cpu(cpu, perf_online_mask);
+ mutex_unlock(&pmus_lock);
}
#else
@@ -10764,6 +11030,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+int perf_event_init_cpu(unsigned int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+
+ perf_swevent_init_cpu(cpu);
+
+ mutex_lock(&pmus_lock);
+ cpumask_set_cpu(cpu, perf_online_mask);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
+ }
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+
int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
@@ -10906,5 +11195,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.attach = perf_cgroup_attach,
+ /*
+ * Implicitly enable on dfl hierarchy so that perf events can
+ * always be filtered by cgroup2 path as long as perf_event
+ * controller is not mounted on a legacy hierarchy.
+ */
+ .implicit_on_dfl = true,
};
#endif /* CONFIG_CGROUP_PERF */