Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r--  tools/perf/util/bpf_skel/.gitignore            |   2
-rw-r--r--  tools/perf/util/bpf_skel/lock_contention.bpf.c | 136
-rw-r--r--  tools/perf/util/bpf_skel/lock_data.h           |  17
-rw-r--r--  tools/perf/util/bpf_skel/sample-filter.h       |  27
-rw-r--r--  tools/perf/util/bpf_skel/sample_filter.bpf.c   | 196
-rw-r--r--  tools/perf/util/bpf_skel/vmlinux.h             | 173
6 files changed, 541 insertions, 10 deletions
diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
index 5263e9e6c5d8..7a1c832825de 100644
--- a/tools/perf/util/bpf_skel/.gitignore
+++ b/tools/perf/util/bpf_skel/.gitignore
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 .tmp
-*.skel.h
\ No newline at end of file
+*.skel.h
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index e6007eaeda1a..8d3cfbb3cc65 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -4,11 +4,12 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
+#include <asm-generic/errno-base.h>
 
 #include "lock_data.h"
 
-/* default buffer size */
-#define MAX_ENTRIES 10240
+/* for collect_lock_syms().  4096 was rejected by the verifier */
+#define MAX_CPUS  1024
 
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN	(1U << 0)
@@ -58,6 +59,13 @@ struct {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u32));
+	__uint(max_entries, MAX_ENTRIES);
+} lock_syms SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(key_size, sizeof(__u32));
 	__uint(value_size, sizeof(__u8));
 	__uint(max_entries, 1);
@@ -92,6 +100,14 @@ struct rw_semaphore___new {
 	atomic_long_t owner;
 } __attribute__((preserve_access_index));
 
+struct mm_struct___old {
+	struct rw_semaphore mmap_sem;
+} __attribute__((preserve_access_index));
+
+struct mm_struct___new {
+	struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
 /* control flags */
 int enabled;
 int has_cpu;
@@ -106,7 +122,13 @@ int lock_owner;
 int aggr_mode;
 
 /* error stat */
-int lost;
+int task_fail;
+int stack_fail;
+int time_fail;
+int data_fail;
+
+int task_map_full;
+int data_map_full;
 
 static inline int can_record(u64 *ctx)
 {
@@ -159,11 +181,12 @@ static inline int update_task_data(struct task_struct *task)
 		return -1;
 
 	p = bpf_map_lookup_elem(&task_data, &pid);
-	if (p == NULL) {
+	if (p == NULL && !task_map_full) {
 		struct contention_task_data data = {};
 
 		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
-		bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST);
+		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
+			task_map_full = 1;
 	}
 
 	return 0;
@@ -182,7 +205,13 @@ static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
 		struct mutex *mutex = (void *)lock;
 		owner = BPF_CORE_READ(mutex, owner.counter);
 	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
-#if __has_builtin(bpf_core_type_matches)
+	/*
+	 * Support for the BPF_TYPE_MATCHES argument to the
+	 * __builtin_preserve_type_info builtin was added at some point during
+	 * development of clang 15 and it's what is needed for
+	 * bpf_core_type_matches.
+	 */
+#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
 		if (bpf_core_type_matches(struct rw_semaphore___old)) {
 			struct rw_semaphore___old *rwsem = (void *)lock;
 			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
@@ -204,6 +233,41 @@ static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
 	return task;
 }
 
+static inline __u32 check_lock_type(__u64 lock, __u32 flags)
+{
+	struct task_struct *curr;
+	struct mm_struct___old *mm_old;
+	struct mm_struct___new *mm_new;
+
+	switch (flags) {
+	case LCB_F_READ:  /* rwsem */
+	case LCB_F_WRITE:
+		curr = bpf_get_current_task_btf();
+		if (curr->mm == NULL)
+			break;
+		mm_new = (void *)curr->mm;
+		if (bpf_core_field_exists(mm_new->mmap_lock)) {
+			if (&mm_new->mmap_lock == (void *)lock)
+				return LCD_F_MMAP_LOCK;
+			break;
+		}
+		mm_old = (void *)curr->mm;
+		if (bpf_core_field_exists(mm_old->mmap_sem)) {
+			if (&mm_old->mmap_sem == (void *)lock)
+				return LCD_F_MMAP_LOCK;
+		}
+		break;
+	case LCB_F_SPIN:  /* spinlock */
+		curr = bpf_get_current_task_btf();
+		if (&curr->sighand->siglock == (void *)lock)
+			return LCD_F_SIGHAND_LOCK;
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
 SEC("tp_btf/contention_begin")
 int contention_begin(u64 *ctx)
 {
@@ -224,7 +288,7 @@ int contention_begin(u64 *ctx)
 		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
 		if (pelem == NULL) {
-			lost++;
+			__sync_fetch_and_add(&task_fail, 1);
 			return 0;
 		}
 	}
@@ -237,7 +301,7 @@ int contention_begin(u64 *ctx)
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
-			lost++;
+			__sync_fetch_and_add(&stack_fail, 1);
 	} else if (aggr_mode == LOCK_AGGR_TASK) {
 		struct task_struct *task;
 
@@ -281,6 +345,11 @@ int contention_end(u64 *ctx)
 		return 0;
 
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
+	if ((__s64)duration < 0) {
+		bpf_map_delete_elem(&tstamp, &pid);
+		__sync_fetch_and_add(&time_fail, 1);
+		return 0;
+	}
 
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
@@ -306,6 +375,12 @@ int contention_end(u64 *ctx)
 
 	data = bpf_map_lookup_elem(&lock_stat, &key);
 	if (!data) {
+		if (data_map_full) {
+			bpf_map_delete_elem(&tstamp, &pid);
+			__sync_fetch_and_add(&data_fail, 1);
+			return 0;
+		}
+
 		struct contention_data first = {
 			.total_time = duration,
 			.max_time = duration,
@@ -313,8 +388,17 @@ int contention_end(u64 *ctx)
 			.count = 1,
 			.flags = pelem->flags,
 		};
+		int err;
+
+		if (aggr_mode == LOCK_AGGR_ADDR)
+			first.flags |= check_lock_type(pelem->lock, pelem->flags);
 
-		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+		if (err < 0) {
+			if (err == -E2BIG)
+				data_map_full = 1;
+			__sync_fetch_and_add(&data_fail, 1);
+		}
 		bpf_map_delete_elem(&tstamp, &pid);
 		return 0;
 	}
@@ -332,4 +416,38 @@ int contention_end(u64 *ctx)
 	return 0;
 }
 
+extern struct rq runqueues __ksym;
+
+struct rq___old {
+	raw_spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct rq___new {
+	raw_spinlock_t __lock;
+} __attribute__((preserve_access_index));
+
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(collect_lock_syms)
+{
+	__u64 lock_addr, lock_off;
+	__u32 lock_flag;
+
+	if (bpf_core_field_exists(struct rq___new, __lock))
+		lock_off = offsetof(struct rq___new, __lock);
+	else
+		lock_off = offsetof(struct rq___old, lock);
+
+	for (int i = 0; i < MAX_CPUS; i++) {
+		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
+
+		if (rq == NULL)
+			break;
+
+		lock_addr = (__u64)(void *)rq + lock_off;
+		lock_flag = LOCK_CLASS_RQLOCK;
+		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+	}
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
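Note on the rw_semaphore___old/___new and mm_struct___old/___new pairs above: this is libbpf's CO-RE name-suffix convention, where a trailing "___<tag>" is stripped before the type is matched against the running kernel's BTF, so one object can carry both layouts and pick the right one at load time. A minimal sketch of the pattern, using a hypothetical kernel struct "foo" whose field was renamed from "bar" to "baz" (the names below are illustrative, not part of the patch):

    #include <bpf/bpf_core_read.h>

    /* hypothetical: old kernels call the field "bar" */
    struct foo___old {
            int bar;
    } __attribute__((preserve_access_index));

    /* hypothetical: new kernels call it "baz" */
    struct foo___new {
            int baz;
    } __attribute__((preserve_access_index));

    static inline int read_foo_field(void *p)
    {
            struct foo___new *f_new = p;
            struct foo___old *f_old = p;

            /* libbpf strips "___new"/"___old" and checks kernel BTF,
             * so exactly one branch survives relocation at load time. */
            if (bpf_core_field_exists(f_new->baz))
                    return BPF_CORE_READ(f_new, baz);
            return BPF_CORE_READ(f_old, bar);
    }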
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 3d35fd4407ac..260062a9f2ab 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -15,6 +15,18 @@ struct contention_task_data {
 	char comm[TASK_COMM_LEN];
 };
 
+/* default buffer size */
+#define MAX_ENTRIES	16384
+
+/*
+ * Upper bits of the flags in the contention_data are used to identify
+ * some well-known locks which do not have symbols (non-global locks).
+ */
+#define LCD_F_MMAP_LOCK		(1U << 31)
+#define LCD_F_SIGHAND_LOCK	(1U << 30)
+
+#define LCB_F_MAX_FLAGS		(1U << 7)
+
 struct contention_data {
 	u64 total_time;
 	u64 min_time;
@@ -29,4 +41,9 @@ enum lock_aggr_mode {
 	LOCK_AGGR_CALLER,
 };
 
+enum lock_class_sym {
+	LOCK_CLASS_NONE,
+	LOCK_CLASS_RQLOCK,
+};
+
 #endif /* UTIL_BPF_SKEL_LOCK_DATA_H */
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
new file mode 100644
index 000000000000..2e96e1ab084a
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -0,0 +1,27 @@
+#ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+#define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+
+#define MAX_FILTERS	64
+
+/* supported filter operations */
+enum perf_bpf_filter_op {
+	PBF_OP_EQ,
+	PBF_OP_NEQ,
+	PBF_OP_GT,
+	PBF_OP_GE,
+	PBF_OP_LT,
+	PBF_OP_LE,
+	PBF_OP_AND,
+	PBF_OP_GROUP_BEGIN,
+	PBF_OP_GROUP_END,
+};
+
+/* BPF map entry for filtering */
+struct perf_bpf_filter_entry {
+	enum perf_bpf_filter_op op;
+	__u32 part; /* sub-sample type info when it has multiple values */
+	__u64 flags; /* perf sample type flags */
+	__u64 value;
+};
+
+#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
\ No newline at end of file
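struct perf_bpf_filter_entry is the ABI shared with user space: the perf tool fills the "filters" array map before enabling the event. A hedged user-space sketch using libbpf's bpf_map_update_elem(); install_period_filter() and map_fd are made-up names for illustration, the real wiring lives in the perf tool's own loader code:

    /* Hypothetical user-space sketch: keep only samples with
     * period >= 10000. Assumes map_fd refers to the "filters" map
     * from the loaded skeleton; error handling elided. */
    #include <linux/perf_event.h>
    #include <bpf/bpf.h>
    #include "sample-filter.h"

    static int install_period_filter(int map_fd)
    {
            struct perf_bpf_filter_entry entry = {
                    .op    = PBF_OP_GE,
                    .part  = 0,
                    .flags = PERF_SAMPLE_PERIOD,
                    .value = 10000,
            };
            int idx = 0;

            return bpf_map_update_elem(map_fd, &idx, &entry, BPF_ANY);
    }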
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
new file mode 100644
index 000000000000..cffe493af1ed
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2023 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "sample-filter.h"
+
+/* BPF map that will be filled by user space */
+struct filters {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct perf_bpf_filter_entry);
+	__uint(max_entries, MAX_FILTERS);
+} filters SEC(".maps");
+
+int dropped;
+
+void *bpf_cast_to_kern_ctx(void *) __ksym;
+
+/* new kernel perf_sample_data definition */
+struct perf_sample_data___new {
+	__u64 sample_flags;
+} __attribute__((preserve_access_index));
+
+/* new kernel perf_mem_data_src definition */
+union perf_mem_data_src___new {
+	__u64 val;
+	struct {
+		__u64   mem_op:5,	/* type of opcode */
+			mem_lvl:14,	/* memory hierarchy level */
+			mem_snoop:5,	/* snoop mode */
+			mem_lock:2,	/* lock instr */
+			mem_dtlb:7,	/* tlb access */
+			mem_lvl_num:4,	/* memory hierarchy level number */
+			mem_remote:1,	/* remote */
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_blk:3,	/* access blocked */
+			mem_hops:3,	/* hop level */
+			mem_rsvd:18;
+	};
+};
+
+/* helper function to return the given perf sample data */
+static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
+				    struct perf_bpf_filter_entry *entry)
+{
+	struct perf_sample_data___new *data = (void *)kctx->data;
+
+	if (!bpf_core_field_exists(data->sample_flags) ||
+	    (data->sample_flags & entry->flags) == 0)
+		return 0;
+
+	switch (entry->flags) {
+	case PERF_SAMPLE_IP:
+		return kctx->data->ip;
+	case PERF_SAMPLE_ID:
+		return kctx->data->id;
+	case PERF_SAMPLE_TID:
+		if (entry->part)
+			return kctx->data->tid_entry.pid;
+		else
+			return kctx->data->tid_entry.tid;
+	case PERF_SAMPLE_CPU:
+		return kctx->data->cpu_entry.cpu;
+	case PERF_SAMPLE_TIME:
+		return kctx->data->time;
+	case PERF_SAMPLE_ADDR:
+		return kctx->data->addr;
+	case PERF_SAMPLE_PERIOD:
+		return kctx->data->period;
+	case PERF_SAMPLE_TRANSACTION:
+		return kctx->data->txn;
+	case PERF_SAMPLE_WEIGHT_STRUCT:
+		if (entry->part == 1)
+			return kctx->data->weight.var1_dw;
+		if (entry->part == 2)
+			return kctx->data->weight.var2_w;
+		if (entry->part == 3)
+			return kctx->data->weight.var3_w;
+		/* fall through */
+	case PERF_SAMPLE_WEIGHT:
+		return kctx->data->weight.full;
+	case PERF_SAMPLE_PHYS_ADDR:
+		return kctx->data->phys_addr;
+	case PERF_SAMPLE_CODE_PAGE_SIZE:
+		return kctx->data->code_page_size;
+	case PERF_SAMPLE_DATA_PAGE_SIZE:
+		return kctx->data->data_page_size;
+	case PERF_SAMPLE_DATA_SRC:
+		if (entry->part == 1)
+			return kctx->data->data_src.mem_op;
+		if (entry->part == 2)
+			return kctx->data->data_src.mem_lvl_num;
+		if (entry->part == 3) {
+			__u32 snoop = kctx->data->data_src.mem_snoop;
+			__u32 snoopx = kctx->data->data_src.mem_snoopx;
+
+			return (snoopx << 5) | snoop;
+		}
+		if (entry->part == 4)
+			return kctx->data->data_src.mem_remote;
+		if (entry->part == 5)
+			return kctx->data->data_src.mem_lock;
+		if (entry->part == 6)
+			return kctx->data->data_src.mem_dtlb;
+		if (entry->part == 7)
+			return kctx->data->data_src.mem_blk;
+		if (entry->part == 8) {
+			union perf_mem_data_src___new *data = (void *)&kctx->data->data_src;
+
+			if (bpf_core_field_exists(data->mem_hops))
+				return data->mem_hops;
+
+			return 0;
+		}
+		/* return the whole word */
+		return kctx->data->data_src.val;
+	default:
+		break;
+	}
+	return 0;
+}
+
+#define CHECK_RESULT(data, op, val)			\
+	if (!(data op val)) {				\
+		if (!in_group)				\
+			goto drop;			\
+	} else if (in_group) {				\
+		group_result = 1;			\
+	}
+
+/* BPF program to be called from perf event overflow handler */
+SEC("perf_event")
+int perf_sample_filter(void *ctx)
+{
+	struct bpf_perf_event_data_kern *kctx;
+	struct perf_bpf_filter_entry *entry;
+	__u64 sample_data;
+	int in_group = 0;
+	int group_result = 0;
+	int i;
+
+	kctx = bpf_cast_to_kern_ctx(ctx);
+
+	for (i = 0; i < MAX_FILTERS; i++) {
+		int key = i; /* needed for verifier :( */
+
+		entry = bpf_map_lookup_elem(&filters, &key);
+		if (entry == NULL)
+			break;
+		sample_data = perf_get_sample(kctx, entry);
+
+		switch (entry->op) {
+		case PBF_OP_EQ:
+			CHECK_RESULT(sample_data, ==, entry->value)
+			break;
+		case PBF_OP_NEQ:
+			CHECK_RESULT(sample_data, !=, entry->value)
+			break;
+		case PBF_OP_GT:
+			CHECK_RESULT(sample_data, >, entry->value)
+			break;
+		case PBF_OP_GE:
+			CHECK_RESULT(sample_data, >=, entry->value)
+			break;
+		case PBF_OP_LT:
+			CHECK_RESULT(sample_data, <, entry->value)
+			break;
+		case PBF_OP_LE:
+			CHECK_RESULT(sample_data, <=, entry->value)
+			break;
+		case PBF_OP_AND:
+			CHECK_RESULT(sample_data, &, entry->value)
+			break;
+		case PBF_OP_GROUP_BEGIN:
+			in_group = 1;
+			group_result = 0;
+			break;
+		case PBF_OP_GROUP_END:
+			if (group_result == 0)
+				goto drop;
+			in_group = 0;
+			break;
+		}
+	}
+	/* generate sample data */
+	return 1;
+
+drop:
+	__sync_fetch_and_add(&dropped, 1);
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
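For reference, the group operators above give OR semantics: between PBF_OP_GROUP_BEGIN and PBF_OP_GROUP_END any passing comparison sets group_result, and the sample is dropped only if the whole group ends with group_result == 0, while comparisons outside a group drop immediately on failure (AND semantics). A hypothetical user-space encoding of "cpu == 0 || cpu == 1" under that scheme (fragment only, field values illustrative):

    /* entries written into the "filters" map at indexes 0..3;
     * unset fields default to zero */
    struct perf_bpf_filter_entry ops[] = {
            { .op = PBF_OP_GROUP_BEGIN },
            { .op = PBF_OP_EQ, .flags = PERF_SAMPLE_CPU, .value = 0 },
            { .op = PBF_OP_EQ, .flags = PERF_SAMPLE_CPU, .value = 1 },
            { .op = PBF_OP_GROUP_END },
    };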
diff --git a/tools/perf/util/bpf_skel/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux.h
new file mode 100644
index 000000000000..449b1ea91fc4
--- /dev/null
+++ b/tools/perf/util/bpf_skel/vmlinux.h
@@ -0,0 +1,173 @@
+#ifndef __VMLINUX_H
+#define __VMLINUX_H
+
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <stdbool.h>
+
+// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.
+
+// Just the fields used in these tools preserving the access index so that
+// libbpf can fixup offsets with the ones used in the kernel when loading the
+// BPF bytecode, if they differ from what is used here.
+
+typedef __u8 u8;
+typedef __u32 u32;
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef int pid_t;
+
+enum cgroup_subsys_id {
+	perf_event_cgrp_id = 8,
+};
+
+enum {
+	HI_SOFTIRQ = 0,
+	TIMER_SOFTIRQ,
+	NET_TX_SOFTIRQ,
+	NET_RX_SOFTIRQ,
+	BLOCK_SOFTIRQ,
+	IRQ_POLL_SOFTIRQ,
+	TASKLET_SOFTIRQ,
+	SCHED_SOFTIRQ,
+	HRTIMER_SOFTIRQ,
+	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */
+
+	NR_SOFTIRQS
+};
+
+typedef struct {
+	s64 counter;
+} __attribute__((preserve_access_index)) atomic64_t;
+
+typedef atomic64_t atomic_long_t;
+
+struct raw_spinlock {
+	int rawlock;
+} __attribute__((preserve_access_index));
+
+typedef struct raw_spinlock raw_spinlock_t;
+
+typedef struct {
+	struct raw_spinlock rlock;
+} __attribute__((preserve_access_index)) spinlock_t;
+
+struct sighand_struct {
+	spinlock_t siglock;
+} __attribute__((preserve_access_index));
+
+struct rw_semaphore {
+	atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct mutex {
+	atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct kernfs_node {
+	u64 id;
+} __attribute__((preserve_access_index));
+
+struct cgroup {
+	struct kernfs_node *kn;
+	int level;
+} __attribute__((preserve_access_index));
+
+struct cgroup_subsys_state {
+	struct cgroup *cgroup;
+} __attribute__((preserve_access_index));
+
+struct css_set {
+	struct cgroup_subsys_state *subsys[13];
+	struct cgroup *dfl_cgrp;
+} __attribute__((preserve_access_index));
+
+struct mm_struct {
+	struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
+struct task_struct {
+	unsigned int flags;
+	struct mm_struct *mm;
+	pid_t pid;
+	pid_t tgid;
+	char comm[16];
+	struct sighand_struct *sighand;
+	struct css_set *cgroups;
+} __attribute__((preserve_access_index));
+
+struct trace_entry {
+	short unsigned int type;
+	unsigned char flags;
+	unsigned char preempt_count;
+	int pid;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_entry {
+	struct trace_entry ent;
+	int irq;
+	u32 __data_loc_name;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_exit {
+	struct trace_entry ent;
+	int irq;
+	int ret;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_softirq {
+	struct trace_entry ent;
+	unsigned int vec;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_start {
+	struct trace_entry ent;
+	void *work;
+	void *function;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_end {
+	struct trace_entry ent;
+	void *work;
+	void *function;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_activate_work {
+	struct trace_entry ent;
+	void *work;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct perf_sample_data {
+	u64 addr;
+	u64 period;
+	union perf_sample_weight weight;
+	u64 txn;
+	union perf_mem_data_src data_src;
+	u64 ip;
+	struct {
+		u32 pid;
+		u32 tid;
+	} tid_entry;
+	u64 time;
+	u64 id;
+	struct {
+		u32 cpu;
+	} cpu_entry;
+	u64 phys_addr;
+	u64 data_page_size;
+	u64 code_page_size;
+} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
+
+struct bpf_perf_event_data_kern {
+	struct perf_sample_data *data;
+	struct perf_event *event;
+} __attribute__((preserve_access_index));
+#endif // __VMLINUX_H
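As the header comment says, this hand-rolled vmlinux.h only declares the fields the tools touch; preserve_access_index makes every plain member dereference a CO-RE relocation that libbpf patches against the running kernel's BTF. A minimal illustrative fragment (the program name probe_pid is made up; the section name is reused from the patch):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    SEC("tp_btf/contention_begin")
    int probe_pid(u64 *ctx)
    {
            /* direct dereference: the offset of task_struct.pid is
             * fixed up at load time even if the kernel layout differs
             * from the minimal definition above */
            struct task_struct *curr = bpf_get_current_task_btf();

            bpf_printk("contended task pid=%d", curr->pid);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";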