Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r--  tools/perf/util/bpf_skel/.gitignore            |   2
-rw-r--r--  tools/perf/util/bpf_skel/lock_contention.bpf.c | 136
-rw-r--r--  tools/perf/util/bpf_skel/lock_data.h           |  17
-rw-r--r--  tools/perf/util/bpf_skel/sample-filter.h       |  27
-rw-r--r--  tools/perf/util/bpf_skel/sample_filter.bpf.c   | 196
-rw-r--r--  tools/perf/util/bpf_skel/vmlinux.h             | 173
6 files changed, 541 insertions, 10 deletions
diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
index 5263e9e6c5d8..7a1c832825de 100644
--- a/tools/perf/util/bpf_skel/.gitignore
+++ b/tools/perf/util/bpf_skel/.gitignore
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 .tmp
-*.skel.h
\ No newline at end of file
+*.skel.h
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index e6007eaeda1a..8d3cfbb3cc65 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -4,11 +4,12 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
+#include <asm-generic/errno-base.h>
 
 #include "lock_data.h"
 
-/* default buffer size */
-#define MAX_ENTRIES 10240
+/* for collect_lock_syms().  4096 was rejected by the verifier */
+#define MAX_CPUS  1024
 
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN	(1U << 0)
@@ -58,6 +59,13 @@ struct {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u32));
+	__uint(max_entries, MAX_ENTRIES);
+} lock_syms SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(key_size, sizeof(__u32));
 	__uint(value_size, sizeof(__u8));
 	__uint(max_entries, 1);
@@ -92,6 +100,14 @@ struct rw_semaphore___new {
 	atomic_long_t owner;
 } __attribute__((preserve_access_index));
 
+struct mm_struct___old {
+	struct rw_semaphore mmap_sem;
+} __attribute__((preserve_access_index));
+
+struct mm_struct___new {
+	struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
 /* control flags */
 int enabled;
 int has_cpu;
@@ -106,7 +122,13 @@ int lock_owner;
 int aggr_mode;
 
 /* error stat */
-int lost;
+int task_fail;
+int stack_fail;
+int time_fail;
+int data_fail;
+
+int task_map_full;
+int data_map_full;
 
 static inline int can_record(u64 *ctx)
 {
@@ -159,11 +181,12 @@ static inline int update_task_data(struct task_struct *task)
 		return -1;
 
 	p = bpf_map_lookup_elem(&task_data, &pid);
-	if (p == NULL) {
+	if (p == NULL && !task_map_full) {
 		struct contention_task_data data = {};
 
 		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
-		bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST);
+		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
+			task_map_full = 1;
 	}
 
 	return 0;
@@ -182,7 +205,13 @@ static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
 		struct mutex *mutex = (void *)lock;
 		owner = BPF_CORE_READ(mutex, owner.counter);
 	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
-#if __has_builtin(bpf_core_type_matches)
+	/*
+	 * Support for the BPF_TYPE_MATCHES argument to the
+	 * __builtin_preserve_type_info builtin was added at some point during
+	 * development of clang 15 and it's what is needed for
+	 * bpf_core_type_matches.
+	 */
+#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
 		if (bpf_core_type_matches(struct rw_semaphore___old)) {
 			struct rw_semaphore___old *rwsem = (void *)lock;
 			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
@@ -204,6 +233,41 @@ static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
 	return task;
 }
 
+static inline __u32 check_lock_type(__u64 lock, __u32 flags)
+{
+	struct task_struct *curr;
+	struct mm_struct___old *mm_old;
+	struct mm_struct___new *mm_new;
+
+	switch (flags) {
+	case LCB_F_READ:  /* rwsem */
+	case LCB_F_WRITE:
+		curr = bpf_get_current_task_btf();
+		if (curr->mm == NULL)
+			break;
+		mm_new = (void *)curr->mm;
+		if (bpf_core_field_exists(mm_new->mmap_lock)) {
+			if (&mm_new->mmap_lock == (void *)lock)
+				return LCD_F_MMAP_LOCK;
+			break;
+		}
+		mm_old = (void *)curr->mm;
+		if (bpf_core_field_exists(mm_old->mmap_sem)) {
+			if (&mm_old->mmap_sem == (void *)lock)
+				return LCD_F_MMAP_LOCK;
+		}
+		break;
+	case LCB_F_SPIN:  /* spinlock */
+		curr = bpf_get_current_task_btf();
+		if (&curr->sighand->siglock == (void *)lock)
+			return LCD_F_SIGHAND_LOCK;
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
 SEC("tp_btf/contention_begin")
 int contention_begin(u64 *ctx)
 {
@@ -224,7 +288,7 @@ int contention_begin(u64 *ctx)
 		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
 		if (pelem == NULL) {
-			lost++;
+			__sync_fetch_and_add(&task_fail, 1);
 			return 0;
 		}
 	}
@@ -237,7 +301,7 @@ int contention_begin(u64 *ctx)
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
-			lost++;
+			__sync_fetch_and_add(&stack_fail, 1);
 	} else if (aggr_mode == LOCK_AGGR_TASK) {
 		struct task_struct *task;
 
@@ -281,6 +345,11 @@ int contention_end(u64 *ctx)
 		return 0;
 
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
+	if ((__s64)duration < 0) {
+		bpf_map_delete_elem(&tstamp, &pid);
+		__sync_fetch_and_add(&time_fail, 1);
+		return 0;
+	}
 
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
@@ -306,6 +375,12 @@ int contention_end(u64 *ctx)
 
 	data = bpf_map_lookup_elem(&lock_stat, &key);
 	if (!data) {
+		if (data_map_full) {
+			bpf_map_delete_elem(&tstamp, &pid);
+			__sync_fetch_and_add(&data_fail, 1);
+			return 0;
+		}
+
 		struct contention_data first = {
 			.total_time = duration,
 			.max_time = duration,
@@ -313,8 +388,17 @@ int contention_end(u64 *ctx)
 			.count = 1,
 			.flags = pelem->flags,
 		};
+		int err;
+
+		if (aggr_mode == LOCK_AGGR_ADDR)
+			first.flags |= check_lock_type(pelem->lock, pelem->flags);
 
-		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+		if (err < 0) {
+			if (err == -E2BIG)
+				data_map_full = 1;
+			__sync_fetch_and_add(&data_fail, 1);
+		}
 		bpf_map_delete_elem(&tstamp, &pid);
 		return 0;
 	}
@@ -332,4 +416,38 @@ int contention_end(u64 *ctx)
 	return 0;
 }
 
+extern struct rq runqueues __ksym;
+
+struct rq___old {
+	raw_spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct rq___new {
+	raw_spinlock_t __lock;
+} __attribute__((preserve_access_index));
+
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(collect_lock_syms)
+{
+	__u64 lock_addr, lock_off;
+	__u32 lock_flag;
+
+	if (bpf_core_field_exists(struct rq___new, __lock))
+		lock_off = offsetof(struct rq___new, __lock);
+	else
+		lock_off = offsetof(struct rq___old, lock);
+
+	for (int i = 0; i < MAX_CPUS; i++) {
+		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
+
+		if (rq == NULL)
+			break;
+
+		lock_addr = (__u64)(void *)rq + lock_off;
+		lock_flag = LOCK_CLASS_RQLOCK;
+		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+	}
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
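Note on the rw_semaphore___old/___new and mm_struct___old/___new pairs above: this is libbpf's CO-RE name-suffix convention, where a trailing "___<tag>" is stripped before the type is matched against the running kernel's BTF, so one object can carry both layouts and pick the right one at load time. A minimal sketch of the pattern, using a hypothetical kernel struct "foo" whose field was renamed from "bar" to "baz" (the names below are illustrative, not part of the patch):

    #include <bpf/bpf_core_read.h>

    /* hypothetical: old kernels call the field "bar" */
    struct foo___old {
            int bar;
    } __attribute__((preserve_access_index));

    /* hypothetical: new kernels call it "baz" */
    struct foo___new {
            int baz;
    } __attribute__((preserve_access_index));

    static inline int read_foo_field(void *p)
    {
            struct foo___new *f_new = p;
            struct foo___old *f_old = p;

            /* libbpf strips "___new"/"___old" and checks kernel BTF,
             * so exactly one branch survives relocation at load time. */
            if (bpf_core_field_exists(f_new->baz))
                    return BPF_CORE_READ(f_new, baz);
            return BPF_CORE_READ(f_old, bar);
    }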
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 3d35fd4407ac..260062a9f2ab 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -15,6 +15,18 @@ struct contention_task_data {
 	char comm[TASK_COMM_LEN];
 };
 
+/* default buffer size */
+#define MAX_ENTRIES	16384
+
+/*
+ * Upper bits of the flags in the contention_data are used to identify
+ * some well-known locks which do not have symbols (non-global locks).
+ */
+#define LCD_F_MMAP_LOCK		(1U << 31)
+#define LCD_F_SIGHAND_LOCK	(1U << 30)
+
+#define LCB_F_MAX_FLAGS		(1U << 7)
+
 struct contention_data {
 	u64 total_time;
 	u64 min_time;
@@ -29,4 +41,9 @@ enum lock_aggr_mode {
 	LOCK_AGGR_CALLER,
 };
 
+enum lock_class_sym {
+	LOCK_CLASS_NONE,
+	LOCK_CLASS_RQLOCK,
+};
+
 #endif /* UTIL_BPF_SKEL_LOCK_DATA_H */
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
new file mode 100644
index 000000000000..2e96e1ab084a
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -0,0 +1,27 @@
+#ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+#define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+
+#define MAX_FILTERS	64
+
+/* supported filter operations */
+enum perf_bpf_filter_op {
+	PBF_OP_EQ,
+	PBF_OP_NEQ,
+	PBF_OP_GT,
+	PBF_OP_GE,
+	PBF_OP_LT,
+	PBF_OP_LE,
+	PBF_OP_AND,
+	PBF_OP_GROUP_BEGIN,
+	PBF_OP_GROUP_END,
+};
+
+/* BPF map entry for filtering */
+struct perf_bpf_filter_entry {
+	enum perf_bpf_filter_op op;
+	__u32 part; /* sub-sample type info when it has multiple values */
+	__u64 flags; /* perf sample type flags */
+	__u64 value;
+};
+
+#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
\ No newline at end of file
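struct perf_bpf_filter_entry is the ABI shared with user space: the perf tool fills the "filters" array map before enabling the event. A hedged user-space sketch using libbpf's bpf_map_update_elem(); install_period_filter() and map_fd are made-up names for illustration, the real wiring lives in the perf tool's own loader code:

    /* Hypothetical user-space sketch: keep only samples with
     * period >= 10000. Assumes map_fd refers to the "filters" map
     * from the loaded skeleton; error handling elided. */
    #include <linux/perf_event.h>
    #include <bpf/bpf.h>
    #include "sample-filter.h"

    static int install_period_filter(int map_fd)
    {
            struct perf_bpf_filter_entry entry = {
                    .op    = PBF_OP_GE,
                    .part  = 0,
                    .flags = PERF_SAMPLE_PERIOD,
                    .value = 10000,
            };
            int idx = 0;

            return bpf_map_update_elem(map_fd, &idx, &entry, BPF_ANY);
    }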
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
new file mode 100644
index 000000000000..cffe493af1ed
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2023 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "sample-filter.h"
+
+/* BPF map that will be filled by user space */
+struct filters {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct perf_bpf_filter_entry);
+	__uint(max_entries, MAX_FILTERS);
+} filters SEC(".maps");
+
+int dropped;
+
+void *bpf_cast_to_kern_ctx(void *) __ksym;
+
+/* new kernel perf_sample_data definition */
+struct perf_sample_data___new {
+	__u64 sample_flags;
+} __attribute__((preserve_access_index));
+
+/* new kernel perf_mem_data_src definition */
+union perf_mem_data_src___new {
+	__u64 val;
+	struct {
+		__u64   mem_op:5,	/* type of opcode */
+			mem_lvl:14,	/* memory hierarchy level */
+			mem_snoop:5,	/* snoop mode */
+			mem_lock:2,	/* lock instr */
+			mem_dtlb:7,	/* tlb access */
+			mem_lvl_num:4,	/* memory hierarchy level number */
+			mem_remote:1,	/* remote */
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_blk:3,	/* access blocked */
+			mem_hops:3,	/* hop level */
+			mem_rsvd:18;
+	};
+};
+
+/* helper function to return the given perf sample data */
+static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
+				    struct perf_bpf_filter_entry *entry)
+{
+	struct perf_sample_data___new *data = (void *)kctx->data;
+
+	if (!bpf_core_field_exists(data->sample_flags) ||
+	    (data->sample_flags & entry->flags) == 0)
+		return 0;
+
+	switch (entry->flags) {
+	case PERF_SAMPLE_IP:
+		return kctx->data->ip;
+	case PERF_SAMPLE_ID:
+		return kctx->data->id;
+	case PERF_SAMPLE_TID:
+		if (entry->part)
+			return kctx->data->tid_entry.pid;
+		else
+			return kctx->data->tid_entry.tid;
+	case PERF_SAMPLE_CPU:
+		return kctx->data->cpu_entry.cpu;
+	case PERF_SAMPLE_TIME:
+		return kctx->data->time;
+	case PERF_SAMPLE_ADDR:
+		return kctx->data->addr;
+	case PERF_SAMPLE_PERIOD:
+		return kctx->data->period;
+	case PERF_SAMPLE_TRANSACTION:
+		return kctx->data->txn;
+	case PERF_SAMPLE_WEIGHT_STRUCT:
+		if (entry->part == 1)
+			return kctx->data->weight.var1_dw;
+		if (entry->part == 2)
+			return kctx->data->weight.var2_w;
+		if (entry->part == 3)
+			return kctx->data->weight.var3_w;
+		/* fall through */
+	case PERF_SAMPLE_WEIGHT:
+		return kctx->data->weight.full;
+	case PERF_SAMPLE_PHYS_ADDR:
+		return kctx->data->phys_addr;
+	case PERF_SAMPLE_CODE_PAGE_SIZE:
+		return kctx->data->code_page_size;
+	case PERF_SAMPLE_DATA_PAGE_SIZE:
+		return kctx->data->data_page_size;
+	case PERF_SAMPLE_DATA_SRC:
+		if (entry->part == 1)
+			return kctx->data->data_src.mem_op;
+		if (entry->part == 2)
+			return kctx->data->data_src.mem_lvl_num;
+		if (entry->part == 3) {
+			__u32 snoop = kctx->data->data_src.mem_snoop;
+			__u32 snoopx = kctx->data->data_src.mem_snoopx;
+
+			return (snoopx << 5) | snoop;
+		}
+		if (entry->part == 4)
+			return kctx->data->data_src.mem_remote;
+		if (entry->part == 5)
+			return kctx->data->data_src.mem_lock;
+		if (entry->part == 6)
+			return kctx->data->data_src.mem_dtlb;
+		if (entry->part == 7)
+			return kctx->data->data_src.mem_blk;
+		if (entry->part == 8) {
+			union perf_mem_data_src___new *data = (void *)&kctx->data->data_src;
+
+			if (bpf_core_field_exists(data->mem_hops))
+				return data->mem_hops;
+
+			return 0;
+		}
+		/* return the whole word */
+		return kctx->data->data_src.val;
+	default:
+		break;
+	}
+	return 0;
+}
+
+#define CHECK_RESULT(data, op, val)			\
+	if (!(data op val)) {				\
+		if (!in_group)				\
+			goto drop;			\
+	} else if (in_group) {				\
+		group_result = 1;			\
+	}
+
+/* BPF program to be called from perf event overflow handler */
+SEC("perf_event")
+int perf_sample_filter(void *ctx)
+{
+	struct bpf_perf_event_data_kern *kctx;
+	struct perf_bpf_filter_entry *entry;
+	__u64 sample_data;
+	int in_group = 0;
+	int group_result = 0;
+	int i;
+
+	kctx = bpf_cast_to_kern_ctx(ctx);
+
+	for (i = 0; i < MAX_FILTERS; i++) {
+		int key = i; /* needed for verifier :( */
+
+		entry = bpf_map_lookup_elem(&filters, &key);
+		if (entry == NULL)
+			break;
+		sample_data = perf_get_sample(kctx, entry);
+
+		switch (entry->op) {
+		case PBF_OP_EQ:
+			CHECK_RESULT(sample_data, ==, entry->value)
+			break;
+		case PBF_OP_NEQ:
+			CHECK_RESULT(sample_data, !=, entry->value)
+			break;
+		case PBF_OP_GT:
+			CHECK_RESULT(sample_data, >, entry->value)
+			break;
+		case PBF_OP_GE:
+			CHECK_RESULT(sample_data, >=, entry->value)
+			break;
+		case PBF_OP_LT:
+			CHECK_RESULT(sample_data, <, entry->value)
+			break;
+		case PBF_OP_LE:
+			CHECK_RESULT(sample_data, <=, entry->value)
+			break;
+		case PBF_OP_AND:
+			CHECK_RESULT(sample_data, &, entry->value)
+			break;
+		case PBF_OP_GROUP_BEGIN:
+			in_group = 1;
+			group_result = 0;
+			break;
+		case PBF_OP_GROUP_END:
+			if (group_result == 0)
+				goto drop;
+			in_group = 0;
+			break;
+		}
+	}
+	/* generate sample data */
+	return 1;
+
+drop:
+	__sync_fetch_and_add(&dropped, 1);
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
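For reference, the group operators above give OR semantics: between PBF_OP_GROUP_BEGIN and PBF_OP_GROUP_END any passing comparison sets group_result, and the sample is dropped only if the whole group ends with group_result == 0, while comparisons outside a group drop immediately on failure (AND semantics). A hypothetical user-space encoding of "cpu == 0 || cpu == 1" under that scheme (fragment only, field values illustrative):

    /* entries written into the "filters" map at indexes 0..3;
     * unset fields default to zero */
    struct perf_bpf_filter_entry ops[] = {
            { .op = PBF_OP_GROUP_BEGIN },
            { .op = PBF_OP_EQ, .flags = PERF_SAMPLE_CPU, .value = 0 },
            { .op = PBF_OP_EQ, .flags = PERF_SAMPLE_CPU, .value = 1 },
            { .op = PBF_OP_GROUP_END },
    };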
diff --git a/tools/perf/util/bpf_skel/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux.h
new file mode 100644
index 000000000000..449b1ea91fc4
--- /dev/null
+++ b/tools/perf/util/bpf_skel/vmlinux.h
@@ -0,0 +1,173 @@
+#ifndef __VMLINUX_H
+#define __VMLINUX_H
+
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <stdbool.h>
+
+// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.
+
+// Just the fields used in these tools preserving the access index so that
+// libbpf can fixup offsets with the ones used in the kernel when loading the
+// BPF bytecode, if they differ from what is used here.
+
+typedef __u8 u8;
+typedef __u32 u32;
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef int pid_t;
+
+enum cgroup_subsys_id {
+	perf_event_cgrp_id = 8,
+};
+
+enum {
+	HI_SOFTIRQ = 0,
+	TIMER_SOFTIRQ,
+	NET_TX_SOFTIRQ,
+	NET_RX_SOFTIRQ,
+	BLOCK_SOFTIRQ,
+	IRQ_POLL_SOFTIRQ,
+	TASKLET_SOFTIRQ,
+	SCHED_SOFTIRQ,
+	HRTIMER_SOFTIRQ,
+	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */
+
+	NR_SOFTIRQS
+};
+
+typedef struct {
+	s64 counter;
+} __attribute__((preserve_access_index)) atomic64_t;
+
+typedef atomic64_t atomic_long_t;
+
+struct raw_spinlock {
+	int rawlock;
+} __attribute__((preserve_access_index));
+
+typedef struct raw_spinlock raw_spinlock_t;
+
+typedef struct {
+	struct raw_spinlock rlock;
+} __attribute__((preserve_access_index)) spinlock_t;
+
+struct sighand_struct {
+	spinlock_t siglock;
+} __attribute__((preserve_access_index));
+
+struct rw_semaphore {
+	atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct mutex {
+	atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct kernfs_node {
+	u64 id;
+} __attribute__((preserve_access_index));
+
+struct cgroup {
+	struct kernfs_node *kn;
+	int level;
+} __attribute__((preserve_access_index));
+
+struct cgroup_subsys_state {
+	struct cgroup *cgroup;
+} __attribute__((preserve_access_index));
+
+struct css_set {
+	struct cgroup_subsys_state *subsys[13];
+	struct cgroup *dfl_cgrp;
+} __attribute__((preserve_access_index));
+
+struct mm_struct {
+	struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
+struct task_struct {
+	unsigned int flags;
+	struct mm_struct *mm;
+	pid_t pid;
+	pid_t tgid;
+	char comm[16];
+	struct sighand_struct *sighand;
+	struct css_set *cgroups;
+} __attribute__((preserve_access_index));
+
+struct trace_entry {
+	short unsigned int type;
+	unsigned char flags;
+	unsigned char preempt_count;
+	int pid;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_entry {
+	struct trace_entry ent;
+	int irq;
+	u32 __data_loc_name;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_exit {
+	struct trace_entry ent;
+	int irq;
+	int ret;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_softirq {
+	struct trace_entry ent;
+	unsigned int vec;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_start {
+	struct trace_entry ent;
+	void *work;
+	void *function;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_end {
+	struct trace_entry ent;
+	void *work;
+	void *function;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_activate_work {
+	struct trace_entry ent;
+	void *work;
+	char __data[];
+} __attribute__((preserve_access_index));
+
+struct perf_sample_data {
+	u64 addr;
+	u64 period;
+	union perf_sample_weight weight;
+	u64 txn;
+	union perf_mem_data_src data_src;
+	u64 ip;
+	struct {
+		u32 pid;
+		u32 tid;
+	} tid_entry;
+	u64 time;
+	u64 id;
+	struct {
+		u32 cpu;
+	} cpu_entry;
+	u64 phys_addr;
+	u64 data_page_size;
+	u64 code_page_size;
+} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
+
+struct bpf_perf_event_data_kern {
+	struct perf_sample_data *data;
+	struct perf_event *event;
+} __attribute__((preserve_access_index));
+#endif // __VMLINUX_H
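As the header comment says, this hand-rolled vmlinux.h only declares the fields the tools touch; preserve_access_index makes every plain member dereference a CO-RE relocation that libbpf patches against the running kernel's BTF. A minimal illustrative fragment (the program name probe_pid is made up; the section name is reused from the patch):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    SEC("tp_btf/contention_begin")
    int probe_pid(u64 *ctx)
    {
            /* direct dereference: the offset of task_struct.pid is
             * fixed up at load time even if the kernel layout differs
             * from the minimal definition above */
            struct task_struct *curr = bpf_get_current_task_btf();

            bpf_printk("contended task pid=%d", curr->pid);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";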