77 files changed, 3715 insertions, 1100 deletions
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index a51267d88ca9..d8fe514c9ec9 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -26,6 +26,8 @@ perf-y += mmap.o
 perf-y += memswap.o
 perf-y += parse-events.o
 perf-y += parse-events-hybrid.o
+perf-y += print-events.o
+perf-y += tracepoint.o
 perf-y += perf_regs.o
 perf-y += path.o
 perf-y += print_binary.o
@@ -148,6 +150,8 @@ perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
+perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
+perf-$(CONFIG_PERF_BPF_SKEL) += bpf_lock_contention.o
 perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
 perf-$(CONFIG_LIBELF) += symbol-elf.o
 perf-$(CONFIG_LIBELF) += probe-file.o
diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c
index d19d765195c5..238305868644 100644
--- a/tools/perf/util/amd-sample-raw.c
+++ b/tools/perf/util/amd-sample-raw.c
@@ -18,6 +18,7 @@
 #include "pmu-events/pmu-events.h"
 
 static u32 cpu_family, cpu_model, ibs_fetch_type, ibs_op_type;
+static bool zen4_ibs_extensions;
 
 static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg)
 {
@@ -39,6 +40,7 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg)
 	};
 	const char *ic_miss_str = NULL;
 	const char *l1tlb_pgsz_str = NULL;
+	char l3_miss_str[sizeof(" L3MissOnly _ FetchOcMiss _ FetchL3Miss _")] = "";
 
 	if (cpu_family == 0x19 && cpu_model < 0x10) {
 		/*
@@ -53,12 +55,19 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg)
 		ic_miss_str = ic_miss_strs[reg.ic_miss];
 	}
 
+	if (zen4_ibs_extensions) {
+		snprintf(l3_miss_str, sizeof(l3_miss_str),
+			 " L3MissOnly %d FetchOcMiss %d FetchL3Miss %d",
+			 reg.l3_miss_only, reg.fetch_oc_miss, reg.fetch_l3_miss);
+	}
+
 	printf("ibs_fetch_ctl:\t%016llx MaxCnt %7d Cnt %7d Lat %5d En %d Val %d Comp %d%s "
-	       "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s\n",
+		"PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s%s\n",
 		reg.val, reg.fetch_maxcnt << 4, reg.fetch_cnt << 4, reg.fetch_lat,
 		reg.fetch_en, reg.fetch_val, reg.fetch_comp, ic_miss_str ? : "",
 		reg.phy_addr_valid, l1tlb_pgsz_str ? : "", reg.l1tlb_miss, reg.l2tlb_miss,
-		reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : "");
+		reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : "",
+		l3_miss_str);
 }
 
 static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg)
@@ -68,9 +77,15 @@ static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg)
 
 static void pr_ibs_op_ctl(union ibs_op_ctl reg)
 {
-	printf("ibs_op_ctl:\t%016llx MaxCnt %9d En %d Val %d CntCtl %d=%s CurCnt %9d\n",
-	       reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, reg.op_en, reg.op_val,
-	       reg.cnt_ctl, reg.cnt_ctl ? "uOps" : "cycles", reg.opcurcnt);
+	char l3_miss_only[sizeof(" L3MissOnly _")] = "";
+
+	if (zen4_ibs_extensions)
+		snprintf(l3_miss_only, sizeof(l3_miss_only), " L3MissOnly %d", reg.l3_miss_only);
+
+	printf("ibs_op_ctl:\t%016llx MaxCnt %9d%s En %d Val %d CntCtl %d=%s CurCnt %9d\n",
+		reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, l3_miss_only,
+		reg.op_en, reg.op_val, reg.cnt_ctl,
+		reg.cnt_ctl ? "uOps" : "cycles", reg.opcurcnt);
 }
 
 static void pr_ibs_op_data(union ibs_op_data reg)
@@ -84,7 +99,34 @@ static void pr_ibs_op_data(union ibs_op_data reg)
 		reg.op_brn_ret, reg.op_rip_invalid, reg.op_brn_fuse, reg.op_microcode);
 }
 
-static void pr_ibs_op_data2(union ibs_op_data2 reg)
+static void pr_ibs_op_data2_extended(union ibs_op_data2 reg)
+{
+	static const char * const data_src_str[] = {
+		"",
+		" DataSrc 1=Local L3 or other L1/L2 in CCX",
+		" DataSrc 2=A peer cache in a near CCX",
+		" DataSrc 3=Data returned from DRAM",
+		" DataSrc 4=(reserved)",
+		" DataSrc 5=A peer cache in a far CCX",
+		" DataSrc 6=DRAM address map with \"long latency\" bit set",
+		" DataSrc 7=Data returned from MMIO/Config/PCI/APIC",
+		" DataSrc 8=Extension Memory (S-Link, GenZ, etc)",
+		" DataSrc 9=(reserved)",
+		" DataSrc 10=(reserved)",
+		" DataSrc 11=(reserved)",
+		" DataSrc 12=Peer Agent Memory",
+		/* 13 to 31 are reserved. Avoid printing them. */
+	};
+	int data_src = (reg.data_src_hi << 3) | reg.data_src_lo;
+
+	printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val,
+		(data_src == 1 || data_src == 2 || data_src == 5) ?
+			(reg.cache_hit_st ? "CacheHitSt 1=O-State " : "CacheHitSt 0=M-state ") : "",
+		reg.rmt_node,
+		data_src < (int)ARRAY_SIZE(data_src_str) ? data_src_str[data_src] : "");
+}
+
+static void pr_ibs_op_data2_default(union ibs_op_data2 reg)
 {
 	static const char * const data_src_str[] = {
 		"",
@@ -98,9 +140,16 @@ static void pr_ibs_op_data2(union ibs_op_data2 reg)
 	};
 
 	printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val,
-	       reg.data_src == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State "
+	       reg.data_src_lo == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State "
 						     : "CacheHitSt 0=M-state ") : "",
-	       reg.rmt_node, data_src_str[reg.data_src]);
+	       reg.rmt_node, data_src_str[reg.data_src_lo]);
+}
+
+static void pr_ibs_op_data2(union ibs_op_data2 reg)
+{
+	if (zen4_ibs_extensions)
+		return pr_ibs_op_data2_extended(reg);
+	pr_ibs_op_data2_default(reg);
 }
 
 static void pr_ibs_op_data3(union ibs_op_data3 reg)
@@ -279,6 +328,9 @@ bool evlist__has_amd_ibs(struct evlist *evlist)
 		pmu_mapping += strlen(pmu_mapping) + 1 /* '\0' */;
 	}
 
+	if (perf_env__find_pmu_cap(env, "ibs_op", "zen4_ibs_extensions"))
+		zen4_ibs_extensions = 1;
+
 	if (ibs_fetch_type || ibs_op_type) {
 		if (!cpu_family)
 			parse_cpuid(env);
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 82cc396ef516..2c6a485c3de5 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1720,6 +1720,7 @@ fallback:
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
 #include <linux/btf.h>
+#include <tools/dis-asm-compat.h>
 
 static int symbol__disassemble_bpf(struct symbol *sym,
 				   struct annotate_args *args)
@@ -1762,9 +1763,9 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 		ret = errno;
 		goto out;
 	}
-	init_disassemble_info(&info, s,
-			      (fprintf_ftype) fprintf);
-
+	init_disassemble_info_compat(&info, s,
+				     (fprintf_ftype) fprintf,
+				     fprintf_styled);
 	info.arch = bfd_get_arch(bfdf);
 	info.mach = bfd_get_mach(bfdf);
 
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 1a80151baed9..d040406f3314 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -387,26 +387,16 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
-#define SPE_MEM_TYPE	(ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
-			 ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
-			 ARM_SPE_REMOTE_ACCESS)
-
-static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
-{
-	if (type & SPE_MEM_TYPE)
-		return true;
-
-	return false;
-}
-
 static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
 {
 	union perf_mem_data_src	data_src = { 0 };
 
 	if (record->op == ARM_SPE_LD)
 		data_src.mem_op = PERF_MEM_OP_LOAD;
-	else
+	else if (record->op == ARM_SPE_ST)
 		data_src.mem_op = PERF_MEM_OP_STORE;
+	else
+		return 0;
 
 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
 		data_src.mem_lvl = PERF_MEM_LVL_L3;
@@ -510,7 +500,11 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 			return err;
 	}
 
-	if (spe->sample_memory && arm_spe__is_memory_event(record->type)) {
+	/*
+	 * When data_src is zero it means the record is not a memory operation,
+	 * skip to synthesize memory sample for this case.
+	 */
+	if (spe->sample_memory && data_src) {
 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
 		if (err)
 			return err;
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 511dd3caa1bc..6edab8a16de6 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -1189,9 +1189,10 @@ void auxtrace_buffer__free(struct auxtrace_buffer *buffer)
 	free(buffer);
 }
 
-void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type,
-			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
-			  const char *msg, u64 timestamp)
+void auxtrace_synth_guest_error(struct perf_record_auxtrace_error *auxtrace_error, int type,
+				int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+				const char *msg, u64 timestamp,
+				pid_t machine_pid, int vcpu)
 {
 	size_t size;
 
@@ -1207,12 +1208,26 @@ void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int
 	auxtrace_error->ip = ip;
 	auxtrace_error->time = timestamp;
 	strlcpy(auxtrace_error->msg, msg, MAX_AUXTRACE_ERROR_MSG);
-
-	size = (void *)auxtrace_error->msg - (void *)auxtrace_error +
-	       strlen(auxtrace_error->msg) + 1;
+	if (machine_pid) {
+		auxtrace_error->fmt = 2;
+		auxtrace_error->machine_pid = machine_pid;
+		auxtrace_error->vcpu = vcpu;
+		size = sizeof(*auxtrace_error);
+	} else {
+		size = (void *)auxtrace_error->msg - (void *)auxtrace_error +
+		       strlen(auxtrace_error->msg) + 1;
+	}
 	auxtrace_error->header.size = PERF_ALIGN(size, sizeof(u64));
 }
 
+void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type,
+			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+			  const char *msg, u64 timestamp)
+{
+	auxtrace_synth_guest_error(auxtrace_error, type, code, cpu, pid, tid,
+				   ip, msg, timestamp, 0, -1);
+}
+
 int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
 					 struct perf_tool *tool,
 					 struct perf_session *session,
@@ -1662,6 +1677,9 @@ size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp)
 	if (!e->fmt)
 		msg = (const char *)&e->time;
 
+	if (e->fmt >= 2 && e->machine_pid)
+		ret += fprintf(fp, " machine_pid %d vcpu %d", e->machine_pid, e->vcpu);
+
 	ret += fprintf(fp, " cpu %d pid %d tid %d ip %#"PRI_lx64" code %u: %s\n",
 		       e->cpu, e->pid, e->tid, e->ip, e->code, msg);
 	return ret;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index cd0d25c2751c..6a4fbfd34c6b 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -595,6 +595,10 @@ int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
 			    bool needs_swap);
 void auxtrace_index__free(struct list_head *head);
 
+void auxtrace_synth_guest_error(struct perf_record_auxtrace_error *auxtrace_error, int type,
+				int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+				const char *msg, u64 timestamp,
+				pid_t machine_pid, int vcpu);
 void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type,
 			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
 			  const char *msg, u64 timestamp);
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index f8ad581ea247..d2c9b09ddb48 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -9,6 +9,7 @@
 #include <linux/bpf.h>
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
+#include <linux/filter.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -49,6 +50,7 @@ struct bpf_prog_priv {
 	struct bpf_insn *insns_buf;
 	int nr_types;
 	int *type_mapping;
+	int *prologue_fds;
 };
 
 struct bpf_perf_object {
@@ -56,6 +58,11 @@ struct bpf_perf_object {
 	struct bpf_object *obj;
 };
 
+struct bpf_preproc_result {
+	struct bpf_insn *new_insn_ptr;
+	int new_insn_cnt;
+};
+
 static LIST_HEAD(bpf_objects_list);
 static struct hashmap *bpf_program_hash;
 static struct hashmap *bpf_map_hash;
@@ -63,20 +70,16 @@ static struct hashmap *bpf_map_hash;
 static struct bpf_perf_object *
 bpf_perf_object__next(struct bpf_perf_object *prev)
 {
-	struct bpf_perf_object *next;
-
-	if (!prev)
-		next = list_first_entry(&bpf_objects_list,
-					struct bpf_perf_object,
-					list);
-	else
-		next = list_next_entry(prev, list);
+	if (!prev) {
+		if (list_empty(&bpf_objects_list))
+			return NULL;
 
-	/* Empty list is noticed here so don't need checking on entry. */
-	if (&next->list == &bpf_objects_list)
+		return list_first_entry(&bpf_objects_list, struct bpf_perf_object, list);
+	}
+	if (list_is_last(&prev->list, &bpf_objects_list))
 		return NULL;
 
-	return next;
+	return list_next_entry(prev, list);
 }
 
 #define bpf_perf_object__for_each(perf_obj, tmp)	\
@@ -86,6 +89,7 @@ bpf_perf_object__next(struct bpf_perf_object *prev)
 	     (perf_obj) = (tmp), (tmp) = bpf_perf_object__next(tmp))
 
 static bool libbpf_initialized;
+static int libbpf_sec_handler;
 
 static int bpf_perf_object__add(struct bpf_object *obj)
 {
@@ -99,12 +103,76 @@ static int bpf_perf_object__add(struct bpf_object *obj)
 	return perf_obj ? 0 : -ENOMEM;
 }
 
+static void *program_priv(const struct bpf_program *prog)
+{
+	void *priv;
+
+	if (IS_ERR_OR_NULL(bpf_program_hash))
+		return NULL;
+	if (!hashmap__find(bpf_program_hash, prog, &priv))
+		return NULL;
+	return priv;
+}
+
+static struct bpf_insn prologue_init_insn[] = {
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_MOV64_IMM(BPF_REG_4, 0),
+	BPF_MOV64_IMM(BPF_REG_5, 0),
+};
+
+static int libbpf_prog_prepare_load_fn(struct bpf_program *prog,
+				       struct bpf_prog_load_opts *opts __maybe_unused,
+				       long cookie __maybe_unused)
+{
+	size_t init_size_cnt = ARRAY_SIZE(prologue_init_insn);
+	size_t orig_insn_cnt, insn_cnt, init_size, orig_size;
+	struct bpf_prog_priv *priv = program_priv(prog);
+	const struct bpf_insn *orig_insn;
+	struct bpf_insn *insn;
+
+	if (IS_ERR_OR_NULL(priv)) {
+		pr_debug("bpf: failed to get private field\n");
+		return -BPF_LOADER_ERRNO__INTERNAL;
+	}
+
+	if (!priv->need_prologue)
+		return 0;
+
+	/* prepend initialization code to program instructions */
+	orig_insn = bpf_program__insns(prog);
+	orig_insn_cnt = bpf_program__insn_cnt(prog);
+	init_size = init_size_cnt * sizeof(*insn);
+	orig_size = orig_insn_cnt * sizeof(*insn);
+
+	insn_cnt = orig_insn_cnt + init_size_cnt;
+	insn = malloc(insn_cnt * sizeof(*insn));
+	if (!insn)
+		return -ENOMEM;
+
+	memcpy(insn, prologue_init_insn, init_size);
+	memcpy((char *) insn + init_size, orig_insn, orig_size);
+	bpf_program__set_insns(prog, insn, insn_cnt);
+	return 0;
+}
+
 static int libbpf_init(void)
 {
+	LIBBPF_OPTS(libbpf_prog_handler_opts, handler_opts,
+		.prog_prepare_load_fn = libbpf_prog_prepare_load_fn,
+	);
+
 	if (libbpf_initialized)
 		return 0;
 
 	libbpf_set_print(libbpf_perf_print);
+	libbpf_sec_handler = libbpf_register_prog_handler(NULL, BPF_PROG_TYPE_KPROBE,
+							  0, &handler_opts);
+	if (libbpf_sec_handler < 0) {
+		pr_debug("bpf: failed to register libbpf section handler: %d\n",
+			 libbpf_sec_handler);
+		return -BPF_LOADER_ERRNO__INTERNAL;
+	}
 	libbpf_initialized = true;
 	return 0;
 }
@@ -188,14 +256,31 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source)
 	return obj;
 }
 
+static void close_prologue_programs(struct bpf_prog_priv *priv)
+{
+	struct perf_probe_event *pev;
+	int i, fd;
+
+	if (!priv->need_prologue)
+		return;
+	pev = &priv->pev;
+	for (i = 0; i < pev->ntevs; i++) {
+		fd = priv->prologue_fds[i];
+		if (fd != -1)
+			close(fd);
+	}
+}
+
 static void
 clear_prog_priv(const struct bpf_program *prog __maybe_unused,
 		void *_priv)
 {
 	struct bpf_prog_priv *priv = _priv;
 
+	close_prologue_programs(priv);
 	cleanup_perf_probe_events(&priv->pev, 1);
 	zfree(&priv->insns_buf);
+	zfree(&priv->prologue_fds);
 	zfree(&priv->type_mapping);
 	zfree(&priv->sys_name);
 	zfree(&priv->evt_name);
@@ -243,17 +328,6 @@ static bool ptr_equal(const void *key1, const void *key2,
 	return key1 == key2;
 }
 
-static void *program_priv(const struct bpf_program *prog)
-{
-	void *priv;
-
-	if (IS_ERR_OR_NULL(bpf_program_hash))
-		return NULL;
-	if (!hashmap__find(bpf_program_hash, prog, &priv))
-		return NULL;
-	return priv;
-}
-
 static int program_set_priv(struct bpf_program *prog, void *priv)
 {
 	void *old_priv;
@@ -558,8 +632,8 @@ static int bpf__prepare_probe(void)
 
 static int
 preproc_gen_prologue(struct bpf_program *prog, int n,
-		     struct bpf_insn *orig_insns, int orig_insns_cnt,
-		     struct bpf_prog_prep_result *res)
+		     const struct bpf_insn *orig_insns, int orig_insns_cnt,
+		     struct bpf_preproc_result *res)
 {
 	struct bpf_prog_priv *priv = program_priv(prog);
 	struct probe_trace_event *tev;
@@ -607,7 +681,6 @@ preproc_gen_prologue(struct bpf_program *prog, int n,
 
 	res->new_insn_ptr = buf;
 	res->new_insn_cnt = prologue_cnt + orig_insns_cnt;
-	res->pfd = NULL;
 	return 0;
 
 errout:
@@ -715,7 +788,7 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 	struct bpf_prog_priv *priv = program_priv(prog);
 	struct perf_probe_event *pev;
 	bool need_prologue = false;
-	int err, i;
+	int i;
 
 	if (IS_ERR_OR_NULL(priv)) {
 		pr_debug("Internal error when hook preprocessor\n");
@@ -753,6 +826,13 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 		return -ENOMEM;
 	}
 
+	priv->prologue_fds = malloc(sizeof(int) * pev->ntevs);
+	if (!priv->prologue_fds) {
+		pr_debug("Not enough memory: alloc prologue fds failed\n");
+		return -ENOMEM;
+	}
+	memset(priv->prologue_fds, -1, sizeof(int) * pev->ntevs);
+
 	priv->type_mapping = malloc(sizeof(int) * pev->ntevs);
 	if (!priv->type_mapping) {
 		pr_debug("Not enough memory: alloc type_mapping failed\n");
@@ -761,13 +841,7 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 	memset(priv->type_mapping, -1,
 	       sizeof(int) * pev->ntevs);
 
-	err = map_prologue(pev, priv->type_mapping, &priv->nr_types);
-	if (err)
-		return err;
-
-	err = bpf_program__set_prep(prog, priv->nr_types,
-				    preproc_gen_prologue);
-	return err;
+	return map_prologue(pev, priv->type_mapping, &priv->nr_types);
 }
 
 int bpf__probe(struct bpf_object *obj)
@@ -874,6 +948,77 @@ int bpf__unprobe(struct bpf_object *obj)
 	return ret;
 }
 
+static int bpf_object__load_prologue(struct bpf_object *obj)
+{
+	int init_cnt = ARRAY_SIZE(prologue_init_insn);
+	const struct bpf_insn *orig_insns;
+	struct bpf_preproc_result res;
+	struct perf_probe_event *pev;
+	struct bpf_program *prog;
+	int orig_insns_cnt;
+
+	bpf_object__for_each_program(prog, obj) {
+		struct bpf_prog_priv *priv = program_priv(prog);
+		int err, i, fd;
+
+		if (IS_ERR_OR_NULL(priv)) {
+			pr_debug("bpf: failed to get private field\n");
+			return -BPF_LOADER_ERRNO__INTERNAL;
+		}
+
+		if (!priv->need_prologue)
+			continue;
+
+		/*
+		 * For each program that needs prologue we do following:
+		 *
+		 * - take its current instructions and use them
+		 *   to generate the new code with prologue
+		 * - load new instructions with bpf_prog_load
+		 *   and keep the fd in prologue_fds
+		 * - new fd will be used in bpf__foreach_event
+		 *   to connect this program with perf evsel
+		 */
+		orig_insns = bpf_program__insns(prog);
+		orig_insns_cnt = bpf_program__insn_cnt(prog);
+
+		pev = &priv->pev;
+		for (i = 0; i < pev->ntevs; i++) {
+			/*
+			 * Skipping artificall prologue_init_insn instructions
+			 * (init_cnt), so the prologue can be generated instead
+			 * of them.
+			 */
+			err = preproc_gen_prologue(prog, i,
+						   orig_insns + init_cnt,
+						   orig_insns_cnt - init_cnt,
+						   &res);
+			if (err)
+				return err;
+
+			fd = bpf_prog_load(bpf_program__get_type(prog),
+					   bpf_program__name(prog), "GPL",
+					   res.new_insn_ptr,
+					   res.new_insn_cnt, NULL);
+			if (fd < 0) {
+				char bf[128];
+
+				libbpf_strerror(-errno, bf, sizeof(bf));
+				pr_debug("bpf: load objects with prologue failed: err=%d: (%s)\n",
+					 -errno, bf);
+				return -errno;
+			}
+			priv->prologue_fds[i] = fd;
+		}
+		/*
+		 * We no longer need the original program,
+		 * we can unload it.
+		 */
+		bpf_program__unload(prog);
+	}
+	return 0;
+}
+
 int bpf__load(struct bpf_object *obj)
 {
 	int err;
@@ -885,7 +1030,7 @@ int bpf__load(struct bpf_object *obj)
 		pr_debug("bpf: load objects failed: err=%d: (%s)\n", err, bf);
 		return err;
 	}
-	return 0;
+	return bpf_object__load_prologue(obj);
 }
 
 int bpf__foreach_event(struct bpf_object *obj,
@@ -920,13 +1065,10 @@ int bpf__foreach_event(struct bpf_object *obj,
 		for (i = 0; i < pev->ntevs; i++) {
 			tev = &pev->tevs[i];
 
-			if (priv->need_prologue) {
-				int type = priv->type_mapping[i];
-
-				fd = bpf_program__nth_fd(prog, type);
-			} else {
+			if (priv->need_prologue)
+				fd = priv->prologue_fds[i];
+			else
 				fd = bpf_program__fd(prog);
-			}
 
 			if (fd < 0) {
 				pr_debug("bpf: failed to get file descriptor\n");
diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c
index e271e05e51bc..80b1d2b3729b 100644
--- a/tools/perf/util/bpf-utils.c
+++ b/tools/perf/util/bpf-utils.c
@@ -149,11 +149,10 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 		count = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
 		size  = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
 
-		data_len += count * size;
+		data_len += roundup(count * size, sizeof(__u64));
 	}
 
 	/* step 3: allocate continuous memory */
-	data_len = roundup(data_len, sizeof(__u64));
 	info_linear = malloc(sizeof(struct perf_bpil) + data_len);
 	if (!info_linear)
 		return ERR_PTR(-ENOMEM);
@@ -180,7 +179,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 		bpf_prog_info_set_offset_u64(&info_linear->info,
 					     desc->array_offset,
 					     ptr_to_u64(ptr));
-		ptr += count * size;
+		ptr += roundup(count * size, sizeof(__u64));
 	}
 
 	/* step 5: call syscall again to get required arrays */
diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c
new file mode 100644
index 000000000000..b629dd679d3f
--- /dev/null
+++ b/tools/perf/util/bpf_kwork.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bpf_kwork.c
+ *
+ * Copyright (c) 2022  Huawei Inc,  Yang Jihong <yangjihong1@huawei.com>
+ */
+
+#include <time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <linux/time64.h>
+
+#include "util/debug.h"
+#include "util/kwork.h"
+
+#include <bpf/bpf.h>
+
+#include "util/bpf_skel/kwork_trace.skel.h"
+
+/*
+ * This should be in sync with "util/kwork_trace.bpf.c"
+ */
+#define MAX_KWORKNAME 128
+
+struct work_key {
+	u32 type;
+	u32 cpu;
+	u64 id;
+};
+
+struct report_data {
+	u64 nr;
+	u64 total_time;
+	u64 max_time;
+	u64 max_time_start;
+	u64 max_time_end;
+};
+
+struct kwork_class_bpf {
+	struct kwork_class *class;
+
+	void (*load_prepare)(struct perf_kwork *kwork);
+	int  (*get_work_name)(struct work_key *key, char **ret_name);
+};
+
+static struct kwork_trace_bpf *skel;
+
+static struct timespec ts_start;
+static struct timespec ts_end;
+
+void perf_kwork__trace_start(void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+	skel->bss->enabled = 1;
+}
+
+void perf_kwork__trace_finish(void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &ts_end);
+	skel->bss->enabled = 0;
+}
+
+static int get_work_name_from_map(struct work_key *key, char **ret_name)
+{
+	char name[MAX_KWORKNAME] = { 0 };
+	int fd = bpf_map__fd(skel->maps.perf_kwork_names);
+
+	*ret_name = NULL;
+
+	if (fd < 0) {
+		pr_debug("Invalid names map fd\n");
+		return 0;
+	}
+
+	if ((bpf_map_lookup_elem(fd, key, name) == 0) && (strlen(name) != 0)) {
+		*ret_name = strdup(name);
+		if (*ret_name == NULL) {
+			pr_err("Failed to copy work name\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static void irq_load_prepare(struct perf_kwork *kwork)
+{
+	if (kwork->report == KWORK_REPORT_RUNTIME) {
+		bpf_program__set_autoload(skel->progs.report_irq_handler_entry, true);
+		bpf_program__set_autoload(skel->progs.report_irq_handler_exit, true);
+	}
+}
+
+static struct kwork_class_bpf kwork_irq_bpf = {
+	.load_prepare  = irq_load_prepare,
+	.get_work_name = get_work_name_from_map,
+};
+
+static void softirq_load_prepare(struct perf_kwork *kwork)
+{
+	if (kwork->report == KWORK_REPORT_RUNTIME) {
+		bpf_program__set_autoload(skel->progs.report_softirq_entry, true);
+		bpf_program__set_autoload(skel->progs.report_softirq_exit, true);
+	} else if (kwork->report == KWORK_REPORT_LATENCY) {
+		bpf_program__set_autoload(skel->progs.latency_softirq_raise, true);
+		bpf_program__set_autoload(skel->progs.latency_softirq_entry, true);
+	}
+}
+
+static struct kwork_class_bpf kwork_softirq_bpf = {
+	.load_prepare  = softirq_load_prepare,
+	.get_work_name = get_work_name_from_map,
+};
+
+static void workqueue_load_prepare(struct perf_kwork *kwork)
+{
+	if (kwork->report == KWORK_REPORT_RUNTIME) {
+		bpf_program__set_autoload(skel->progs.report_workqueue_execute_start, true);
+		bpf_program__set_autoload(skel->progs.report_workqueue_execute_end, true);
+	} else if (kwork->report == KWORK_REPORT_LATENCY) {
+		bpf_program__set_autoload(skel->progs.latency_workqueue_activate_work, true);
+		bpf_program__set_autoload(skel->progs.latency_workqueue_execute_start, true);
+	}
+}
+
+static struct kwork_class_bpf kwork_workqueue_bpf = {
+	.load_prepare  = workqueue_load_prepare,
+	.get_work_name = get_work_name_from_map,
+};
+
+static struct kwork_class_bpf *
+kwork_class_bpf_supported_list[KWORK_CLASS_MAX] = {
+	[KWORK_CLASS_IRQ]       = &kwork_irq_bpf,
+	[KWORK_CLASS_SOFTIRQ]   = &kwork_softirq_bpf,
+	[KWORK_CLASS_WORKQUEUE] = &kwork_workqueue_bpf,
+};
+
+static bool valid_kwork_class_type(enum kwork_class_type type)
+{
+	return type >= 0 && type < KWORK_CLASS_MAX ? true : false;
+}
+
+static int setup_filters(struct perf_kwork *kwork)
+{
+	u8 val = 1;
+	int i, nr_cpus, key, fd;
+	struct perf_cpu_map *map;
+
+	if (kwork->cpu_list != NULL) {
+		fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter);
+		if (fd < 0) {
+			pr_debug("Invalid cpu filter fd\n");
+			return -1;
+		}
+
+		map = perf_cpu_map__new(kwork->cpu_list);
+		if (map == NULL) {
+			pr_debug("Invalid cpu_list\n");
+			return -1;
+		}
+
+		nr_cpus = libbpf_num_possible_cpus();
+		for (i = 0; i < perf_cpu_map__nr(map); i++) {
+			struct perf_cpu cpu = perf_cpu_map__cpu(map, i);
+
+			if (cpu.cpu >= nr_cpus) {
+				perf_cpu_map__put(map);
+				pr_err("Requested cpu %d too large\n", cpu.cpu);
+				return -1;
+			}
+			bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY);
+		}
+		perf_cpu_map__put(map);
+
+		skel->bss->has_cpu_filter = 1;
+	}
+
+	if (kwork->profile_name != NULL) {
+		if (strlen(kwork->profile_name) >= MAX_KWORKNAME) {
+			pr_err("Requested name filter %s too large, limit to %d\n",
+			       kwork->profile_name, MAX_KWORKNAME - 1);
+			return -1;
+		}
+
+		fd = bpf_map__fd(skel->maps.perf_kwork_name_filter);
+		if (fd < 0) {
+			pr_debug("Invalid name filter fd\n");
+			return -1;
+		}
+
+		key = 0;
+		bpf_map_update_elem(fd, &key, kwork->profile_name, BPF_ANY);
+
+		skel->bss->has_name_filter = 1;
+	}
+
+	return 0;
+}
+
+int perf_kwork__trace_prepare_bpf(struct perf_kwork *kwork)
+{
+	struct bpf_program *prog;
+	struct kwork_class *class;
+	struct kwork_class_bpf *class_bpf;
+	enum kwork_class_type type;
+
+	skel = kwork_trace_bpf__open();
+	if (!skel) {
+		pr_debug("Failed to open kwork trace skeleton\n");
+		return -1;
+	}
+
+	/*
+	 * set all progs to non-autoload,
+	 * then set corresponding progs according to config
+	 */
+	bpf_object__for_each_program(prog, skel->obj)
+		bpf_program__set_autoload(prog, false);
+
+	list_for_each_entry(class, &kwork->class_list, list) {
+		type = class->type;
+		if (!valid_kwork_class_type(type) ||
+		    (kwork_class_bpf_supported_list[type] == NULL)) {
+			pr_err("Unsupported bpf trace class %s\n", class->name);
+			goto out;
+		}
+
+		class_bpf = kwork_class_bpf_supported_list[type];
+		class_bpf->class = class;
+
+		if (class_bpf->load_prepare != NULL)
+			class_bpf->load_prepare(kwork);
+	}
+
+	if (kwork_trace_bpf__load(skel)) {
+		pr_debug("Failed to load kwork trace skeleton\n");
+		goto out;
+	}
+
+	if (setup_filters(kwork))
+		goto out;
+
+	if (kwork_trace_bpf__attach(skel)) {
+		pr_debug("Failed to attach kwork trace skeleton\n");
+		goto out;
+	}
+
+	return 0;
+
+out:
+	kwork_trace_bpf__destroy(skel);
+	return -1;
+}
+
+static int add_work(struct perf_kwork *kwork,
+		    struct work_key *key,
+		    struct report_data *data)
+{
+	struct kwork_work *work;
+	struct kwork_class_bpf *bpf_trace;
+	struct kwork_work tmp = {
+		.id = key->id,
+		.name = NULL,
+		.cpu = key->cpu,
+	};
+	enum kwork_class_type type = key->type;
+
+	if (!valid_kwork_class_type(type)) {
+		pr_debug("Invalid class type %d to add work\n", type);
+		return -1;
+	}
+
+	bpf_trace = kwork_class_bpf_supported_list[type];
+	tmp.class = bpf_trace->class;
+
+	if ((bpf_trace->get_work_name != NULL) &&
+	    (bpf_trace->get_work_name(key, &tmp.name)))
+		return -1;
+
+	work = perf_kwork_add_work(kwork, tmp.class, &tmp);
+	if (work == NULL)
+		return -1;
+
+	if (kwork->report == KWORK_REPORT_RUNTIME) {
+		work->nr_atoms = data->nr;
+		work->total_runtime = data->total_time;
+		work->max_runtime = data->max_time;
+		work->max_runtime_start = data->max_time_start;
+		work->max_runtime_end = data->max_time_end;
+	} else if (kwork->report == KWORK_REPORT_LATENCY) {
+		work->nr_atoms = data->nr;
+		work->total_latency = data->total_time;
+		work->max_latency = data->max_time;
+		work->max_latency_start = data->max_time_start;
+		work->max_latency_end = data->max_time_end;
+	} else {
+		pr_debug("Invalid bpf report type %d\n", kwork->report);
+		return -1;
+	}
+
+	kwork->timestart = (u64)ts_start.tv_sec * NSEC_PER_SEC + ts_start.tv_nsec;
+	kwork->timeend = (u64)ts_end.tv_sec * NSEC_PER_SEC + ts_end.tv_nsec;
+
+	return 0;
+}
+
+int perf_kwork__report_read_bpf(struct perf_kwork *kwork)
+{
+	struct report_data data;
+	struct work_key key = {
+		.type = 0,
+		.cpu  = 0,
+		.id   = 0,
+	};
+	struct work_key prev = {
+		.type = 0,
+		.cpu  = 0,
+		.id   = 0,
+	};
+	int fd = bpf_map__fd(skel->maps.perf_kwork_report);
+
+	if (fd < 0) {
+		pr_debug("Invalid report fd\n");
+		return -1;
+	}
+
+	while (!bpf_map_get_next_key(fd, &prev, &key)) {
+		if ((bpf_map_lookup_elem(fd, &key, &data)) != 0) {
+			pr_debug("Failed to lookup report elem\n");
+			return -1;
+		}
+
+		if ((data.nr != 0) && (add_work(kwork, &key, &data) != 0))
+			return -1;
+
+		prev = key;
+	}
+	return 0;
+}
+
+void perf_kwork__report_cleanup_bpf(void)
+{
+	kwork_trace_bpf__destroy(skel);
+}
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
new file mode 100644
index 000000000000..c591a66733ef
--- /dev/null
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/debug.h"
+#include "util/evlist.h"
+#include "util/machine.h"
+#include "util/map.h"
+#include "util/symbol.h"
+#include "util/target.h"
+#include "util/thread_map.h"
+#include "util/lock-contention.h"
+#include <linux/zalloc.h>
+#include <bpf/bpf.h>
+
+#include "bpf_skel/lock_contention.skel.h"
+
+static struct lock_contention_bpf *skel;
+
+/* should be same as bpf_skel/lock_contention.bpf.c */
+struct lock_contention_key {
+	s32 stack_id;
+};
+
+struct lock_contention_data {
+	u64 total_time;
+	u64 min_time;
+	u64 max_time;
+	u32 count;
+	u32 flags;
+};
+
+int lock_contention_prepare(struct lock_contention *con)
+{
+	int i, fd;
+	int ncpus = 1, ntasks = 1;
+	struct evlist *evlist = con->evlist;
+	struct target *target = con->target;
+
+	skel = lock_contention_bpf__open();
+	if (!skel) {
+		pr_err("Failed to open lock-contention BPF skeleton\n");
+		return -1;
+	}
+
+	bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
+	bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
+
+	if (target__has_cpu(target))
+		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
+	if (target__has_task(target))
+		ntasks = perf_thread_map__nr(evlist->core.threads);
+
+	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
+	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
+
+	if (lock_contention_bpf__load(skel) < 0) {
+		pr_err("Failed to load lock-contention BPF skeleton\n");
+		return -1;
+	}
+
+	if (target__has_cpu(target)) {
+		u32 cpu;
+		u8 val = 1;
+
+		skel->bss->has_cpu = 1;
+		fd = bpf_map__fd(skel->maps.cpu_filter);
+
+		for (i = 0; i < ncpus; i++) {
+			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
+			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
+		}
+	}
+
+	if (target__has_task(target)) {
+		u32 pid;
+		u8 val = 1;
+
+		skel->bss->has_task = 1;
+		fd = bpf_map__fd(skel->maps.task_filter);
+
+		for (i = 0; i < ntasks; i++) {
+			pid = perf_thread_map__pid(evlist->core.threads, i);
+			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+		}
+	}
+
+	if (target__none(target) && evlist->workload.pid > 0) {
+		u32 pid = evlist->workload.pid;
+		u8 val = 1;
+
+		skel->bss->has_task = 1;
+		fd = bpf_map__fd(skel->maps.task_filter);
+		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+	}
+
+	lock_contention_bpf__attach(skel);
+	return 0;
+}
+
+int lock_contention_start(void)
+{
+	skel->bss->enabled = 1;
+	return 0;
+}
+
+int lock_contention_stop(void)
+{
+	skel->bss->enabled = 0;
+	return 0;
+}
+
+int lock_contention_read(struct lock_contention *con)
+{
+	int fd, stack;
+	s32 prev_key, key;
+	struct lock_contention_data data;
+	struct lock_stat *st;
+	struct machine *machine = con->machine;
+	u64 stack_trace[CONTENTION_STACK_DEPTH];
+
+	fd = bpf_map__fd(skel->maps.lock_stat);
+	stack = bpf_map__fd(skel->maps.stacks);
+
+	con->lost = skel->bss->lost;
+
+	prev_key = 0;
+	while (!bpf_map_get_next_key(fd, &prev_key, &key)) {
+		struct map *kmap;
+		struct symbol *sym;
+		int idx;
+
+		bpf_map_lookup_elem(fd, &key, &data);
+		st = zalloc(sizeof(*st));
+		if (st == NULL)
+			return -1;
+
+		st->nr_contended = data.count;
+		st->wait_time_total = data.total_time;
+		st->wait_time_max = data.max_time;
+		st->wait_time_min = data.min_time;
+
+		if (data.count)
+			st->avg_wait_time = data.total_time / data.count;
+
+		st->flags = data.flags;
+
+		bpf_map_lookup_elem(stack, &key, stack_trace);
+
+		/* skip BPF + lock internal functions */
+		idx = CONTENTION_STACK_SKIP;
+		while (is_lock_function(machine, stack_trace[idx]) &&
+		       idx < CONTENTION_STACK_DEPTH - 1)
+			idx++;
+
+		st->addr = stack_trace[idx];
+		sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
+
+		if (sym) {
+			unsigned long offset;
+			int ret = 0;
+
+			offset = kmap->map_ip(kmap, st->addr) - sym->start;
+
+			if (offset)
+				ret = asprintf(&st->name, "%s+%#lx", sym->name, offset);
+			else
+				st->name = strdup(sym->name);
+
+			if (ret < 0 || st->name == NULL)
+				return -1;
+		} else if (asprintf(&st->name, "%#lx", (unsigned long)st->addr) < 0) {
+			free(st);
+			return -1;
+		}
+
+		hlist_add_head(&st->hash_entry, con->result);
+		prev_key = key;
+	}
+
+	return 0;
+}
+
+int lock_contention_finish(void)
+{
+	if (skel) {
+		skel->bss->enabled = 0;
+		lock_contention_bpf__destroy(skel);
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index b73e84a02264..f289b7713598 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -265,6 +265,12 @@ int off_cpu_write(struct perf_session *session)
 
 	sample_type = evsel->core.attr.sample_type;
 
+	if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
+		pr_err("not supported sample type: %llx\n",
+		       (unsigned long long)sample_type);
+		return -1;
+	}
+
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
 		if (evsel->core.id)
 			sid = evsel->core.id[0];
@@ -319,7 +325,6 @@ int off_cpu_write(struct perf_session *session)
 		}
 		if (sample_type & PERF_SAMPLE_CGROUP)
 			data.array[n++] = key.cgroup_id;
-		/* TODO: handle more sample types */
 
 		size = n * sizeof(u64);
 		data.hdr.size = size;
diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
new file mode 100644
index 000000000000..063c124e0999
--- /dev/null
+++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022, Huawei
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define KWORK_COUNT 100
+#define MAX_KWORKNAME 128
+
+/*
+ * This should be in sync with "util/kwork.h"
+ */
+enum kwork_class_type {
+	KWORK_CLASS_IRQ,
+	KWORK_CLASS_SOFTIRQ,
+	KWORK_CLASS_WORKQUEUE,
+	KWORK_CLASS_MAX,
+};
+
+struct work_key {
+	__u32 type;
+	__u32 cpu;
+	__u64 id;
+};
+
+struct report_data {
+	__u64 nr;
+	__u64 total_time;
+	__u64 max_time;
+	__u64 max_time_start;
+	__u64 max_time_end;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct work_key));
+	__uint(value_size, MAX_KWORKNAME);
+	__uint(max_entries, KWORK_COUNT);
+} perf_kwork_names SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct work_key));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, KWORK_COUNT);
+} perf_kwork_time SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct work_key));
+	__uint(value_size, sizeof(struct report_data));
+	__uint(max_entries, KWORK_COUNT);
+} perf_kwork_report SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} perf_kwork_cpu_filter SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, MAX_KWORKNAME);
+	__uint(max_entries, 1);
+} perf_kwork_name_filter SEC(".maps");
+
+int enabled = 0;
+int has_cpu_filter = 0;
+int has_name_filter = 0;
+
+static __always_inline int local_strncmp(const char *s1,
+					 unsigned int sz, const char *s2)
+{
+	int ret = 0;
+	unsigned int i;
+
+	for (i = 0; i < sz; i++) {
+		ret = (unsigned char)s1[i] - (unsigned char)s2[i];
+		if (ret || !s1[i] || !s2[i])
+			break;
+	}
+
+	return ret;
+}
+
+static __always_inline int trace_event_match(struct work_key *key, char *name)
+{
+	__u8 *cpu_val;
+	char *name_val;
+	__u32 zero = 0;
+	__u32 cpu = bpf_get_smp_processor_id();
+
+	if (!enabled)
+		return 0;
+
+	if (has_cpu_filter) {
+		cpu_val = bpf_map_lookup_elem(&perf_kwork_cpu_filter, &cpu);
+		if (!cpu_val)
+			return 0;
+	}
+
+	if (has_name_filter && (name != NULL)) {
+		name_val = bpf_map_lookup_elem(&perf_kwork_name_filter, &zero);
+		if (name_val &&
+		    (local_strncmp(name_val, MAX_KWORKNAME, name) != 0)) {
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static __always_inline void do_update_time(void *map, struct work_key *key,
+					   __u64 time_start, __u64 time_end)
+{
+	struct report_data zero, *data;
+	__s64 delta = time_end - time_start;
+
+	if (delta < 0)
+		return;
+
+	data = bpf_map_lookup_elem(map, key);
+	if (!data) {
+		__builtin_memset(&zero, 0, sizeof(zero));
+		bpf_map_update_elem(map, key, &zero, BPF_NOEXIST);
+		data = bpf_map_lookup_elem(map, key);
+		if (!data)
+			return;
+	}
+
+	if ((delta > data->max_time) ||
+	    (data->max_time == 0)) {
+		data->max_time       = delta;
+		data->max_time_start = time_start;
+		data->max_time_end   = time_end;
+	}
+
+	data->total_time += delta;
+	data->nr++;
+}
+
+static __always_inline void do_update_timestart(void *map, struct work_key *key)
+{
+	__u64 ts = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(map, key, &ts, BPF_ANY);
+}
+
+static __always_inline void do_update_timeend(void *report_map, void *time_map,
+					      struct work_key *key)
+{
+	__u64 *time = bpf_map_lookup_elem(time_map, key);
+
+	if (time) {
+		bpf_map_delete_elem(time_map, key);
+		do_update_time(report_map, key, *time, bpf_ktime_get_ns());
+	}
+}
+
+static __always_inline void do_update_name(void *map,
+					   struct work_key *key, char *name)
+{
+	if (!bpf_map_lookup_elem(map, key))
+		bpf_map_update_elem(map, key, name, BPF_ANY);
+}
+
+static __always_inline int update_timestart(void *map, struct work_key *key)
+{
+	if (!trace_event_match(key, NULL))
+		return 0;
+
+	do_update_timestart(map, key);
+	return 0;
+}
+
+static __always_inline int update_timestart_and_name(void *time_map,
+						     void *names_map,
+						     struct work_key *key,
+						     char *name)
+{
+	if (!trace_event_match(key, name))
+		return 0;
+
+	do_update_timestart(time_map, key);
+	do_update_name(names_map, key, name);
+
+	return 0;
+}
+
+static __always_inline int update_timeend(void *report_map,
+					  void *time_map, struct work_key *key)
+{
+	if (!trace_event_match(key, NULL))
+		return 0;
+
+	do_update_timeend(report_map, time_map, key);
+
+	return 0;
+}
+
+static __always_inline int update_timeend_and_name(void *report_map,
+						   void *time_map,
+						   void *names_map,
+						   struct work_key *key,
+						   char *name)
+{
+	if (!trace_event_match(key, name))
+		return 0;
+
+	do_update_timeend(report_map, time_map, key);
+	do_update_name(names_map, key, name);
+
+	return 0;
+}
+
+SEC("tracepoint/irq/irq_handler_entry")
+int report_irq_handler_entry(struct trace_event_raw_irq_handler_entry *ctx)
+{
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_IRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->irq,
+	};
+	void *name_addr = (void *)ctx + (ctx->__data_loc_name & 0xffff);
+
+	bpf_probe_read_kernel_str(name, sizeof(name), name_addr);
+
+	return update_timestart_and_name(&perf_kwork_time,
+					 &perf_kwork_names, &key, name);
+}
+
+SEC("tracepoint/irq/irq_handler_exit")
+int report_irq_handler_exit(struct trace_event_raw_irq_handler_exit *ctx)
+{
+	struct work_key key = {
+		.type = KWORK_CLASS_IRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->irq,
+	};
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+static char softirq_name_list[NR_SOFTIRQS][MAX_KWORKNAME] = {
+	{ "HI"       },
+	{ "TIMER"    },
+	{ "NET_TX"   },
+	{ "NET_RX"   },
+	{ "BLOCK"    },
+	{ "IRQ_POLL" },
+	{ "TASKLET"  },
+	{ "SCHED"    },
+	{ "HRTIMER"  },
+	{ "RCU"      },
+};
+
+SEC("tracepoint/irq/softirq_entry")
+int report_softirq_entry(struct trace_event_raw_softirq *ctx)
+{
+	unsigned int vec = ctx->vec;
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)vec,
+	};
+
+	if (vec < NR_SOFTIRQS) {
+		return update_timestart_and_name(&perf_kwork_time,
+						 &perf_kwork_names, &key,
+						 softirq_name_list[vec]);
+	}
+
+	return 0;
+}
+
+SEC("tracepoint/irq/softirq_exit")
+int report_softirq_exit(struct trace_event_raw_softirq *ctx)
+{
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->vec,
+	};
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+SEC("tracepoint/irq/softirq_raise")
+int latency_softirq_raise(struct trace_event_raw_softirq *ctx)
+{
+	unsigned int vec = ctx->vec;
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)vec,
+	};
+
+	if (vec < NR_SOFTIRQS) {
+		return update_timestart_and_name(&perf_kwork_time,
+						 &perf_kwork_names, &key,
+						 softirq_name_list[vec]);
+	}
+
+	return 0;
+}
+
+SEC("tracepoint/irq/softirq_entry")
+int latency_softirq_entry(struct trace_event_raw_softirq *ctx)
+{
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->vec,
+	};
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+SEC("tracepoint/workqueue/workqueue_execute_start")
+int report_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx)
+{
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->work,
+	};
+
+	return update_timestart(&perf_kwork_time, &key);
+}
+
+SEC("tracepoint/workqueue/workqueue_execute_end")
+int report_workqueue_execute_end(struct trace_event_raw_workqueue_execute_end *ctx)
+{
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->work,
+	};
+	unsigned long long func_addr = (unsigned long long)ctx->function;
+
+	__builtin_memset(name, 0, sizeof(name));
+	bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr));
+
+	return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time,
+				       &perf_kwork_names, &key, name);
+}
+
+SEC("tracepoint/workqueue/workqueue_activate_work")
+int latency_workqueue_activate_work(struct trace_event_raw_workqueue_activate_work *ctx)
+{
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->work,
+	};
+
+	return update_timestart(&perf_kwork_time, &key);
+}
+
+SEC("tracepoint/workqueue/workqueue_execute_start")
+int latency_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx)
+{
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu  = bpf_get_smp_processor_id(),
+		.id   = (__u64)ctx->work,
+	};
+	unsigned long long func_addr = (unsigned long long)ctx->function;
+
+	__builtin_memset(name, 0, sizeof(name));
+	bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr));
+
+	return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time,
+				       &perf_kwork_names, &key, name);
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
new file mode 100644
index 000000000000..9e8b94eb6320
--- /dev/null
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* maximum stack trace depth */
+#define MAX_STACKS   8
+
+/* default buffer size */
+#define MAX_ENTRIES  10240
+
+struct contention_key {
+	__s32 stack_id;
+};
+
+struct contention_data {
+	__u64 total_time;
+	__u64 min_time;
+	__u64 max_time;
+	__u32 count;
+	__u32 flags;
+};
+
+struct tstamp_data {
+	__u64 timestamp;
+	__u64 lock;
+	__u32 flags;
+	__s32 stack_id;
+};
+
+/* callstack storage  */
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, MAX_STACKS * sizeof(__u64));
+	__uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps");
+
+/* maintain timestamp at the beginning of contention */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tstamp_data);
+} tstamp SEC(".maps");
+
+/* actual lock contention statistics */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct contention_key));
+	__uint(value_size, sizeof(struct contention_data));
+	__uint(max_entries, MAX_ENTRIES);
+} lock_stat SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} cpu_filter SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} task_filter SEC(".maps");
+
+/* control flags */
+int enabled;
+int has_cpu;
+int has_task;
+
+/* error stat */
+unsigned long lost;
+
+static inline int can_record(void)
+{
+	if (has_cpu) {
+		__u32 cpu = bpf_get_smp_processor_id();
+		__u8 *ok;
+
+		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+		if (!ok)
+			return 0;
+	}
+
+	if (has_task) {
+		__u8 *ok;
+		__u32 pid = bpf_get_current_pid_tgid();
+
+		ok = bpf_map_lookup_elem(&task_filter, &pid);
+		if (!ok)
+			return 0;
+	}
+
+	return 1;
+}
+
+SEC("tp_btf/contention_begin")
+int contention_begin(u64 *ctx)
+{
+	struct task_struct *curr;
+	struct tstamp_data *pelem;
+
+	if (!enabled || !can_record())
+		return 0;
+
+	curr = bpf_get_current_task_btf();
+	pelem = bpf_task_storage_get(&tstamp, curr, NULL,
+				     BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!pelem || pelem->lock)
+		return 0;
+
+	pelem->timestamp = bpf_ktime_get_ns();
+	pelem->lock = (__u64)ctx[0];
+	pelem->flags = (__u32)ctx[1];
+	pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP);
+
+	if (pelem->stack_id < 0)
+		lost++;
+	return 0;
+}
+
+SEC("tp_btf/contention_end")
+int contention_end(u64 *ctx)
+{
+	struct task_struct *curr;
+	struct tstamp_data *pelem;
+	struct contention_key key;
+	struct contention_data *data;
+	__u64 duration;
+
+	if (!enabled)
+		return 0;
+
+	curr = bpf_get_current_task_btf();
+	pelem = bpf_task_storage_get(&tstamp, curr, NULL, 0);
+	if (!pelem || pelem->lock != ctx[0])
+		return 0;
+
+	duration = bpf_ktime_get_ns() - pelem->timestamp;
+
+	key.stack_id = pelem->stack_id;
+	data = bpf_map_lookup_elem(&lock_stat, &key);
+	if (!data) {
+		struct contention_data first = {
+			.total_time = duration,
+			.max_time = duration,
+			.min_time = duration,
+			.count = 1,
+			.flags = pelem->flags,
+		};
+
+		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+		pelem->lock = 0;
+		return 0;
+	}
+
+	__sync_fetch_and_add(&data->total_time, duration);
+	__sync_fetch_and_add(&data->count, 1);
+
+	/* FIXME: need atomic operations */
+	if (data->max_time < duration)
+		data->max_time = duration;
+	if (data->min_time > duration)
+		data->min_time = duration;
+
+	pelem->lock = 0;
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index 792ae2847080..cc6d7fd55118 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -71,6 +71,11 @@ struct {
 	__uint(max_entries, 1);
 } cgroup_filter SEC(".maps");
 
+/* new kernel task_struct definition */
+struct task_struct___new {
+	long __state;
+} __attribute__((preserve_access_index));
+
 /* old kernel task_struct definition */
 struct task_struct___old {
 	long state;
@@ -93,14 +98,17 @@ const volatile bool uses_cgroup_v1 = false;
  */
 static inline int get_task_state(struct task_struct *t)
 {
-	if (bpf_core_field_exists(t->__state))
-		return BPF_CORE_READ(t, __state);
+	/* recast pointer to capture new type for compiler */
+	struct task_struct___new *t_new = (void *)t;
 
-	/* recast pointer to capture task_struct___old type for compiler */
-	struct task_struct___old *t_old = (void *)t;
+	if (bpf_core_field_exists(t_new->__state)) {
+		return BPF_CORE_READ(t_new, __state);
+	} else {
+		/* recast pointer to capture old type for compiler */
+		struct task_struct___old *t_old = (void *)t;
 
-	/* now use old "state" name of the field */
-	return BPF_CORE_READ(t_old, state);
+		return BPF_CORE_READ(t_old, state);
+	}
 }
 
 static inline __u64 get_cgroup_id(struct task_struct *t)
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 82f3d46bea70..9e176146eb10 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -300,12 +300,6 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
 	return __dso__build_id_filename(dso, bf, size, is_debug, is_kallsyms);
 }
 
-#define dsos__for_each_with_build_id(pos, head)	\
-	list_for_each_entry(pos, head, node)	\
-		if (!pos->has_build_id)		\
-			continue;		\
-		else
-
 static int write_buildid(const char *name, size_t name_len, struct build_id *bid,
 			 pid_t pid, u16 misc, struct feat_fd *fd)
 {
@@ -567,14 +561,11 @@ char *build_id_cache__cachedir(const char *sbuild_id, const char *name,
 	char *realname = (char *)name, *filename;
 	bool slash = is_kallsyms || is_vdso;
 
-	if (!slash) {
+	if (!slash)
 		realname = nsinfo__realpath(name, nsi);
-		if (!realname)
-			return NULL;
-	}
 
 	if (asprintf(&filename, "%s%s%s%s%s", buildid_dir, slash ? "/" : "",
-		     is_vdso ? DSO__NAME_VDSO : realname,
+		     is_vdso ? DSO__NAME_VDSO : (realname ? realname : name),
 		     sbuild_id ? "/" : "", sbuild_id ?: "") < 0)
 		filename = NULL;
 
@@ -631,9 +622,12 @@ static int build_id_cache__add_sdt_cache(const char *sbuild_id,
 #endif
 
 static char *build_id_cache__find_debug(const char *sbuild_id,
-					struct nsinfo *nsi)
+					struct nsinfo *nsi,
+					const char *root_dir)
 {
+	const char *dirname = "/usr/lib/debug/.build-id/";
 	char *realname = NULL;
+	char dirbuf[PATH_MAX];
 	char *debugfile;
 	struct nscookie nsc;
 	size_t len = 0;
@@ -642,8 +636,12 @@ static char *build_id_cache__find_debug(const char *sbuild_id,
 	if (!debugfile)
 		goto out;
 
-	len = __symbol__join_symfs(debugfile, PATH_MAX,
-				   "/usr/lib/debug/.build-id/");
+	if (root_dir) {
+		path__join(dirbuf, PATH_MAX, root_dir, dirname);
+		dirname = dirbuf;
+	}
+
+	len = __symbol__join_symfs(debugfile, PATH_MAX, dirname);
 	snprintf(debugfile + len, PATH_MAX - len, "%.2s/%s.debug", sbuild_id,
 		 sbuild_id + 2);
 
@@ -674,14 +672,18 @@ out:
 
 int
 build_id_cache__add(const char *sbuild_id, const char *name, const char *realname,
-		    struct nsinfo *nsi, bool is_kallsyms, bool is_vdso)
+		    struct nsinfo *nsi, bool is_kallsyms, bool is_vdso,
+		    const char *proper_name, const char *root_dir)
 {
 	const size_t size = PATH_MAX;
 	char *filename = NULL, *dir_name = NULL, *linkname = zalloc(size), *tmp;
 	char *debugfile = NULL;
 	int err = -1;
 
-	dir_name = build_id_cache__cachedir(sbuild_id, name, nsi, is_kallsyms,
+	if (!proper_name)
+		proper_name = name;
+
+	dir_name = build_id_cache__cachedir(sbuild_id, proper_name, nsi, is_kallsyms,
 					    is_vdso);
 	if (!dir_name)
 		goto out_free;
@@ -721,7 +723,7 @@ build_id_cache__add(const char *sbuild_id, const char *name, const char *realnam
 	 */
 	if (!is_kallsyms && !is_vdso &&
 	    strncmp(".ko", name + strlen(name) - 3, 3)) {
-		debugfile = build_id_cache__find_debug(sbuild_id, nsi);
+		debugfile = build_id_cache__find_debug(sbuild_id, nsi, root_dir);
 		if (debugfile) {
 			zfree(&filename);
 			if (asprintf(&filename, "%s/%s", dir_name,
@@ -787,8 +789,9 @@ out_free:
 	return err;
 }
 
-int build_id_cache__add_s(const char *sbuild_id, const char *name,
-			  struct nsinfo *nsi, bool is_kallsyms, bool is_vdso)
+int __build_id_cache__add_s(const char *sbuild_id, const char *name,
+			    struct nsinfo *nsi, bool is_kallsyms, bool is_vdso,
+			    const char *proper_name, const char *root_dir)
 {
 	char *realname = NULL;
 	int err = -1;
@@ -802,8 +805,8 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name,
 			goto out_free;
 	}
 
-	err = build_id_cache__add(sbuild_id, name, realname, nsi, is_kallsyms, is_vdso);
-
+	err = build_id_cache__add(sbuild_id, name, realname, nsi,
+				  is_kallsyms, is_vdso, proper_name, root_dir);
 out_free:
 	if (!is_kallsyms)
 		free(realname);
@@ -812,14 +815,16 @@ out_free:
 
 static int build_id_cache__add_b(const struct build_id *bid,
 				 const char *name, struct nsinfo *nsi,
-				 bool is_kallsyms, bool is_vdso)
+				 bool is_kallsyms, bool is_vdso,
+				 const char *proper_name,
+				 const char *root_dir)
 {
 	char sbuild_id[SBUILD_ID_SIZE];
 
 	build_id__sprintf(bid, sbuild_id);
 
-	return build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms,
-				     is_vdso);
+	return __build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms,
+				       is_vdso, proper_name, root_dir);
 }
 
 bool build_id_cache__cached(const char *sbuild_id)
@@ -872,12 +877,40 @@ out_free:
 	return err;
 }
 
+static int filename__read_build_id_ns(const char *filename,
+				      struct build_id *bid,
+				      struct nsinfo *nsi)
+{
+	struct nscookie nsc;
+	int ret;
+
+	nsinfo__mountns_enter(nsi, &nsc);
+	ret = filename__read_build_id(filename, bid);
+	nsinfo__mountns_exit(&nsc);
+
+	return ret;
+}
+
+static bool dso__build_id_mismatch(struct dso *dso, const char *name)
+{
+	struct build_id bid;
+
+	if (filename__read_build_id_ns(name, &bid, dso->nsinfo) < 0)
+		return false;
+
+	return !dso__build_id_equal(dso, &bid);
+}
+
 static int dso__cache_build_id(struct dso *dso, struct machine *machine,
 			       void *priv __maybe_unused)
 {
 	bool is_kallsyms = dso__is_kallsyms(dso);
 	bool is_vdso = dso__is_vdso(dso);
 	const char *name = dso->long_name;
+	const char *proper_name = NULL;
+	const char *root_dir = NULL;
+	char *allocated_name = NULL;
+	int ret = 0;
 
 	if (!dso->has_build_id)
 		return 0;
@@ -886,8 +919,29 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine,
 		is_kallsyms = true;
 		name = machine->mmap_name;
 	}
-	return build_id_cache__add_b(&dso->bid, name, dso->nsinfo,
-				     is_kallsyms, is_vdso);
+
+	if (!machine__is_host(machine)) {
+		if (*machine->root_dir) {
+			root_dir = machine->root_dir;
+			ret = asprintf(&allocated_name, "%s/%s", root_dir, name);
+			if (ret < 0)
+				return ret;
+			proper_name = name;
+			name = allocated_name;
+		} else if (is_kallsyms) {
+			/* Cannot get guest kallsyms */
+			return 0;
+		}
+	}
+
+	if (!is_kallsyms && dso__build_id_mismatch(dso, name))
+		goto out_free;
+
+	ret = build_id_cache__add_b(&dso->bid, name, dso->nsinfo,
+				    is_kallsyms, is_vdso, proper_name, root_dir);
+out_free:
+	free(allocated_name);
+	return ret;
 }
 
 static int
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index c19617151670..4e3a1169379b 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -66,10 +66,18 @@ int build_id_cache__list_build_ids(const char *pathname, struct nsinfo *nsi,
 				   struct strlist **result);
 bool build_id_cache__cached(const char *sbuild_id);
 int build_id_cache__add(const char *sbuild_id, const char *name, const char *realname,
-			struct nsinfo *nsi, bool is_kallsyms, bool is_vdso);
-int build_id_cache__add_s(const char *sbuild_id,
-			  const char *name, struct nsinfo *nsi,
-			  bool is_kallsyms, bool is_vdso);
+			struct nsinfo *nsi, bool is_kallsyms, bool is_vdso,
+			const char *proper_name, const char *root_dir);
+int __build_id_cache__add_s(const char *sbuild_id,
+			    const char *name, struct nsinfo *nsi,
+			    bool is_kallsyms, bool is_vdso,
+			    const char *proper_name, const char *root_dir);
+static inline int build_id_cache__add_s(const char *sbuild_id,
+					const char *name, struct nsinfo *nsi,
+					bool is_kallsyms, bool is_vdso)
+{
+	return __build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms, is_vdso, NULL, NULL);
+}
 int build_id_cache__remove_s(const char *sbuild_id);
 
 extern char buildid_dir[];
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 5c27a4b2e7a7..7e663673f79f 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -31,6 +31,7 @@
 #include "callchain.h"
 #include "branch.h"
 #include "symbol.h"
+#include "util.h"
 #include "../perf.h"
 
 #define CALLCHAIN_PARAM_DEFAULT			\
@@ -266,12 +267,17 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
 	do {
 		/* Framepointer style */
 		if (!strncmp(name, "fp", sizeof("fp"))) {
-			if (!strtok_r(NULL, ",", &saveptr)) {
-				param->record_mode = CALLCHAIN_FP;
-				ret = 0;
-			} else
-				pr_err("callchain: No more arguments "
-				       "needed for --call-graph fp\n");
+			ret = 0;
+			param->record_mode = CALLCHAIN_FP;
+
+			tok = strtok_r(NULL, ",", &saveptr);
+			if (tok) {
+				unsigned long size;
+
+				size = strtoul(tok, &name, 0);
+				if (size < (unsigned) sysctl__max_stack())
+					param->max_stack = size;
+			}
 			break;
 
 		/* Dwarf style */
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 8b95fb3c4d7b..16db965ac995 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -1451,7 +1451,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
 		 * tidq->packet->instr_count represents the number of
 		 * instructions in the current etm packet.
 		 *
-		 * Period instructions (Pi) contains the the number of
+		 * Period instructions (Pi) contains the number of
 		 * instructions executed after the sample point(n) from the
 		 * previous etm packet.  This will always be less than
 		 * etm->instructions_sample_period.
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index f1ab6edba446..613d6ae82663 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -149,6 +149,7 @@ static int process_sample_event(struct perf_tool *tool,
 	struct convert_json *c = container_of(tool, struct convert_json, tool);
 	FILE *out = c->out;
 	struct addr_location al, tal;
+	u64 sample_type = __evlist__combined_sample_type(evsel->evlist);
 	u8 cpumode = PERF_RECORD_MISC_USER;
 
 	if (machine__resolve(machine, &al, sample) < 0) {
@@ -168,7 +169,9 @@ static int process_sample_event(struct perf_tool *tool,
 	output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_);
 	output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid);
 
-	if (al.thread->cpu >= 0)
+	if ((sample_type & PERF_SAMPLE_CPU))
+		output_json_key_format(out, true, 3, "cpu", "%i", sample->cpu);
+	else if (al.thread->cpu >= 0)
 		output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu);
 
 	output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread));
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index caabeac24c69..a7f68c309545 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -3,6 +3,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/zalloc.h>
+#include <linux/err.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <errno.h>
@@ -481,16 +482,21 @@ int perf_data__make_kcore_dir(struct perf_data *data, char *buf, size_t buf_sz)
 
 bool has_kcore_dir(const char *path)
 {
-	char *kcore_dir;
-	int ret;
-
-	if (asprintf(&kcore_dir, "%s/kcore_dir", path) < 0)
-		return false;
-
-	ret = access(kcore_dir, F_OK);
+	struct dirent *d = ERR_PTR(-EINVAL);
+	const char *name = "kcore_dir";
+	DIR *dir = opendir(path);
+	size_t n = strlen(name);
+	bool result = false;
+
+	if (dir) {
+		while (d && !result) {
+			d = readdir(dir);
+			result = d ? strncmp(d->d_name, name, n) : false;
+		}
+		closedir(dir);
+	}
 
-	free(kcore_dir);
-	return !ret;
+	return result;
 }
 
 char *perf_data__kallsyms_name(struct perf_data *data)
@@ -512,6 +518,25 @@ char *perf_data__kallsyms_name(struct perf_data *data)
 	return kallsyms_name;
 }
 
+char *perf_data__guest_kallsyms_name(struct perf_data *data, pid_t machine_pid)
+{
+	char *kallsyms_name;
+	struct stat st;
+
+	if (!data->is_dir)
+		return NULL;
+
+	if (asprintf(&kallsyms_name, "%s/kcore_dir__%d/kallsyms", data->path, machine_pid) < 0)
+		return NULL;
+
+	if (stat(kallsyms_name, &st)) {
+		free(kallsyms_name);
+		return NULL;
+	}
+
+	return kallsyms_name;
+}
+
 bool is_perf_data(const char *path)
 {
 	bool ret = false;
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
index 7de53d6e2d7f..effcc195d7e9 100644
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@@ -4,6 +4,7 @@
 
 #include <stdio.h>
 #include <stdbool.h>
+#include <unistd.h>
 #include <linux/types.h>
 
 enum perf_data_mode {
@@ -101,5 +102,6 @@ unsigned long perf_data__size(struct perf_data *data);
 int perf_data__make_kcore_dir(struct perf_data *data, char *buf, size_t buf_sz);
 bool has_kcore_dir(const char *path);
 char *perf_data__kallsyms_name(struct perf_data *data);
+char *perf_data__guest_kallsyms_name(struct perf_data *data, pid_t machine_pid);
 bool is_perf_data(const char *path);
 #endif /* __PERF_DATA_H */
diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
index db964d5a52af..54e4d4495e00 100644
--- a/tools/perf/util/dlfilter.c
+++ b/tools/perf/util/dlfilter.c
@@ -495,6 +495,8 @@ int dlfilter__do_filter_event(struct dlfilter *d,
 	ASSIGN(misc);
 	ASSIGN(raw_size);
 	ASSIGN(raw_data);
+	ASSIGN(machine_pid);
+	ASSIGN(vcpu);
 
 	if (sample->branch_stack) {
 		d_sample.brstack_nr = sample->branch_stack->nr;
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 97047a11282b..66981c7a9a18 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -227,6 +227,12 @@ struct dso {
 #define dso__for_each_symbol(dso, pos, n)	\
 	symbols__for_each_entry(&(dso)->symbols, pos, n)
 
+#define dsos__for_each_with_build_id(pos, head)	\
+	list_for_each_entry(pos, head, node)	\
+		if (!pos->has_build_id)		\
+			continue;		\
+		else
+
 static inline void dso__set_loaded(struct dso *dso)
 {
 	dso->loaded = true;
diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c
index b97366f77bbf..2bd23e4cf19e 100644
--- a/tools/perf/util/dsos.c
+++ b/tools/perf/util/dsos.c
@@ -23,8 +23,19 @@ static int __dso_id__cmp(struct dso_id *a, struct dso_id *b)
 	if (a->ino > b->ino) return -1;
 	if (a->ino < b->ino) return 1;
 
-	if (a->ino_generation > b->ino_generation) return -1;
-	if (a->ino_generation < b->ino_generation) return 1;
+	/*
+	 * Synthesized MMAP events have zero ino_generation, avoid comparing
+	 * them with MMAP events with actual ino_generation.
+	 *
+	 * I found it harmful because the mismatch resulted in a new
+	 * dso that did not have a build ID whereas the original dso did have a
+	 * build ID. The build ID was essential because the object was not found
+	 * otherwise. - Adrian
+	 */
+	if (a->ino_generation && b->ino_generation) {
+		if (a->ino_generation > b->ino_generation) return -1;
+		if (a->ino_generation < b->ino_generation) return 1;
+	}
 
 	return 0;
 }
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 579e44c59914..5b8cf6a421a4 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -179,7 +179,7 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused)
 
 void perf_env__exit(struct perf_env *env)
 {
-	int i;
+	int i, j;
 
 	perf_env__purge_bpf(env);
 	perf_env__purge_cgroups(env);
@@ -196,6 +196,8 @@ void perf_env__exit(struct perf_env *env)
 	zfree(&env->sibling_threads);
 	zfree(&env->pmu_mappings);
 	zfree(&env->cpu);
+	for (i = 0; i < env->nr_cpu_pmu_caps; i++)
+		zfree(&env->cpu_pmu_caps[i]);
 	zfree(&env->cpu_pmu_caps);
 	zfree(&env->numa_map);
 
@@ -217,11 +219,13 @@ void perf_env__exit(struct perf_env *env)
 	}
 	zfree(&env->hybrid_nodes);
 
-	for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) {
-		zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps);
-		zfree(&env->hybrid_cpc_nodes[i].pmu_name);
+	for (i = 0; i < env->nr_pmus_with_caps; i++) {
+		for (j = 0; j < env->pmu_caps[i].nr_caps; j++)
+			zfree(&env->pmu_caps[i].caps[j]);
+		zfree(&env->pmu_caps[i].caps);
+		zfree(&env->pmu_caps[i].pmu_name);
 	}
-	zfree(&env->hybrid_cpc_nodes);
+	zfree(&env->pmu_caps);
 }
 
 void perf_env__init(struct perf_env *env)
@@ -527,3 +531,51 @@ int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu)
 
 	return cpu.cpu >= 0 && cpu.cpu < env->nr_numa_map ? env->numa_map[cpu.cpu] : -1;
 }
+
+char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name,
+			     const char *cap)
+{
+	char *cap_eq;
+	int cap_size;
+	char **ptr;
+	int i, j;
+
+	if (!pmu_name || !cap)
+		return NULL;
+
+	cap_size = strlen(cap);
+	cap_eq = zalloc(cap_size + 2);
+	if (!cap_eq)
+		return NULL;
+
+	memcpy(cap_eq, cap, cap_size);
+	cap_eq[cap_size] = '=';
+
+	if (!strcmp(pmu_name, "cpu")) {
+		for (i = 0; i < env->nr_cpu_pmu_caps; i++) {
+			if (!strncmp(env->cpu_pmu_caps[i], cap_eq, cap_size + 1)) {
+				free(cap_eq);
+				return &env->cpu_pmu_caps[i][cap_size + 1];
+			}
+		}
+		goto out;
+	}
+
+	for (i = 0; i < env->nr_pmus_with_caps; i++) {
+		if (strcmp(env->pmu_caps[i].pmu_name, pmu_name))
+			continue;
+
+		ptr = env->pmu_caps[i].caps;
+
+		for (j = 0; j < env->pmu_caps[i].nr_caps; j++) {
+			if (!strncmp(ptr[j], cap_eq, cap_size + 1)) {
+				free(cap_eq);
+				return &ptr[j][cap_size + 1];
+			}
+		}
+	}
+
+out:
+	free(cap_eq);
+	return NULL;
+}
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index a3541f98e1fc..4566c51f2fd9 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -43,10 +43,10 @@ struct hybrid_node {
 	char	*cpus;
 };
 
-struct hybrid_cpc_node {
-	int		nr_cpu_pmu_caps;
+struct pmu_caps {
+	int		nr_caps;
 	unsigned int    max_branches;
-	char            *cpu_pmu_caps;
+	char            **caps;
 	char            *pmu_name;
 };
 
@@ -74,14 +74,14 @@ struct perf_env {
 	int			nr_groups;
 	int			nr_cpu_pmu_caps;
 	int			nr_hybrid_nodes;
-	int			nr_hybrid_cpc_nodes;
+	int			nr_pmus_with_caps;
 	char			*cmdline;
 	const char		**cmdline_argv;
 	char			*sibling_cores;
 	char			*sibling_dies;
 	char			*sibling_threads;
 	char			*pmu_mappings;
-	char			*cpu_pmu_caps;
+	char			**cpu_pmu_caps;
 	struct cpu_topology_map	*cpu;
 	struct cpu_cache_level	*caches;
 	int			 caches_cnt;
@@ -94,7 +94,7 @@ struct perf_env {
 	struct memory_node	*memory_nodes;
 	unsigned long long	 memory_bsize;
 	struct hybrid_node	*hybrid_nodes;
-	struct hybrid_cpc_node	*hybrid_cpc_nodes;
+	struct pmu_caps		*pmu_caps;
 #ifdef HAVE_LIBBPF_SUPPORT
 	/*
 	 * bpf_info_lock protects bpf rbtrees. This is needed because the
@@ -172,4 +172,6 @@ bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node);
 struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id);
 
 int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu);
+char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name,
+			     const char *cap);
 #endif /* __PERF_ENV_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 0476bb3a4188..1fa14598b916 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -76,6 +76,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_TIME_CONV]			= "TIME_CONV",
 	[PERF_RECORD_HEADER_FEATURE]		= "FEATURE",
 	[PERF_RECORD_COMPRESSED]		= "COMPRESSED",
+	[PERF_RECORD_FINISHED_INIT]		= "FINISHED_INIT",
 };
 
 const char *perf_event__name(unsigned int id)
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index cdd72e05fd28..a7b0931d5137 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -148,6 +148,8 @@ struct perf_sample {
 	u64 code_page_size;
 	u64 cgroup;
 	u32 flags;
+	u32 machine_pid;
+	u32 vcpu;
 	u16 insn_len;
 	u8  cpumode;
 	u16 misc;
@@ -482,4 +484,25 @@ void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *a
 const char *arch_perf_header_entry(const char *se_header);
 int arch_support_sort_key(const char *sort_key);
 
+static inline bool perf_event_header__cpumode_is_guest(u8 cpumode)
+{
+	return cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
+	       cpumode == PERF_RECORD_MISC_GUEST_USER;
+}
+
+static inline bool perf_event_header__misc_is_guest(u16 misc)
+{
+	return perf_event_header__cpumode_is_guest(misc & PERF_RECORD_MISC_CPUMODE_MASK);
+}
+
+static inline bool perf_event_header__is_guest(const struct perf_event_header *header)
+{
+	return perf_event_header__misc_is_guest(header->misc);
+}
+
+static inline bool perf_event__is_guest(const union perf_event *event)
+{
+	return perf_event_header__is_guest(&event->header);
+}
+
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 48af7d379d82..48167f3941a6 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -309,7 +309,7 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide)
 	return evsel;
 }
 
-static int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs)
+int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs)
 {
 	struct evsel *evsel, *n;
 	LIST_HEAD(head);
@@ -342,9 +342,14 @@ int __evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *a
 	return evlist__add_attrs(evlist, attrs, nr_attrs);
 }
 
-__weak int arch_evlist__add_default_attrs(struct evlist *evlist __maybe_unused)
+__weak int arch_evlist__add_default_attrs(struct evlist *evlist,
+					  struct perf_event_attr *attrs,
+					  size_t nr_attrs)
 {
-	return 0;
+	if (!nr_attrs)
+		return 0;
+
+	return __evlist__add_default_attrs(evlist, attrs, nr_attrs);
 }
 
 struct evsel *evlist__find_tracepoint_by_id(struct evlist *evlist, int id)
@@ -1244,34 +1249,8 @@ bool evlist__valid_read_format(struct evlist *evlist)
 u16 evlist__id_hdr_size(struct evlist *evlist)
 {
 	struct evsel *first = evlist__first(evlist);
-	struct perf_sample *data;
-	u64 sample_type;
-	u16 size = 0;
-
-	if (!first->core.attr.sample_id_all)
-		goto out;
-
-	sample_type = first->core.attr.sample_type;
-
-	if (sample_type & PERF_SAMPLE_TID)
-		size += sizeof(data->tid) * 2;
-
-       if (sample_type & PERF_SAMPLE_TIME)
-		size += sizeof(data->time);
-
-	if (sample_type & PERF_SAMPLE_ID)
-		size += sizeof(data->id);
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		size += sizeof(data->stream_id);
-
-	if (sample_type & PERF_SAMPLE_CPU)
-		size += sizeof(data->cpu) * 2;
-
-	if (sample_type & PERF_SAMPLE_IDENTIFIER)
-		size += sizeof(data->id);
-out:
-	return size;
+	return first->core.attr.sample_id_all ? evsel__id_hdr_size(first) : 0;
 }
 
 bool evlist__valid_sample_id_all(struct evlist *evlist)
@@ -1533,10 +1512,22 @@ int evlist__start_workload(struct evlist *evlist)
 int evlist__parse_sample(struct evlist *evlist, union perf_event *event, struct perf_sample *sample)
 {
 	struct evsel *evsel = evlist__event2evsel(evlist, event);
+	int ret;
 
 	if (!evsel)
 		return -EFAULT;
-	return evsel__parse_sample(evsel, event, sample);
+	ret = evsel__parse_sample(evsel, event, sample);
+	if (ret)
+		return ret;
+	if (perf_guest && sample->id) {
+		struct perf_sample_id *sid = evlist__id2sid(evlist, sample->id);
+
+		if (sid) {
+			sample->machine_pid = sid->machine_pid;
+			sample->vcpu = sid->vcpu.cpu;
+		}
+	}
+	return 0;
 }
 
 int evlist__parse_sample_timestamp(struct evlist *evlist, union perf_event *event, u64 *timestamp)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 1bde9ccf4e7d..351ba2887a79 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -104,13 +104,18 @@ static inline int evlist__add_default(struct evlist *evlist)
 	return __evlist__add_default(evlist, true);
 }
 
+int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs);
+
 int __evlist__add_default_attrs(struct evlist *evlist,
 				     struct perf_event_attr *attrs, size_t nr_attrs);
 
+int arch_evlist__add_default_attrs(struct evlist *evlist,
+				   struct perf_event_attr *attrs,
+				   size_t nr_attrs);
+
 #define evlist__add_default_attrs(evlist, array) \
-	__evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array))
+	arch_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array))
 
-int arch_evlist__add_default_attrs(struct evlist *evlist);
 struct evsel *arch_evlist__leader(struct list_head *list);
 
 int evlist__add_dummy(struct evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ce499c5da8d7..4852089e1d79 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -48,6 +48,7 @@
 #include "util.h"
 #include "hashmap.h"
 #include "pmu-hybrid.h"
+#include "off_cpu.h"
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
 #include <internal/xyarray.h>
@@ -593,9 +594,14 @@ static int evsel__add_modifiers(struct evsel *evsel, char *bf, size_t size)
 	return r;
 }
 
+int __weak arch_evsel__hw_name(struct evsel *evsel, char *bf, size_t size)
+{
+	return scnprintf(bf, size, "%s", __evsel__hw_name(evsel->core.attr.config));
+}
+
 static int evsel__hw_name(struct evsel *evsel, char *bf, size_t size)
 {
-	int r = scnprintf(bf, size, "%s", __evsel__hw_name(evsel->core.attr.config));
+	int r = arch_evsel__hw_name(evsel, bf, size);
 	return r + evsel__add_modifiers(evsel, bf + r, size - r);
 }
 
@@ -1091,6 +1097,11 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un
 {
 }
 
+void __weak arch__post_evsel_config(struct evsel *evsel __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused)
+{
+}
+
 static void evsel__set_default_freq_period(struct record_opts *opts,
 					   struct perf_event_attr *attr)
 {
@@ -1102,6 +1113,11 @@ static void evsel__set_default_freq_period(struct record_opts *opts,
 	}
 }
 
+static bool evsel__is_offcpu_event(struct evsel *evsel)
+{
+	return evsel__is_bpf_output(evsel) && !strcmp(evsel->name, OFFCPU_EVENT);
+}
+
 /*
  * The enable_on_exec/disabled value strategy:
  *
@@ -1366,6 +1382,11 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
 	 */
 	if (evsel__is_dummy_event(evsel))
 		evsel__reset_sample_bit(evsel, BRANCH_STACK);
+
+	if (evsel__is_offcpu_event(evsel))
+		evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;
+
+	arch__post_evsel_config(evsel, attr);
 }
 
 int evsel__set_filter(struct evsel *evsel, const char *filter)
@@ -2349,6 +2370,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 	data->misc    = event->header.misc;
 	data->id = -1ULL;
 	data->data_src = PERF_MEM_DATA_SRC_NONE;
+	data->vcpu = -1;
 
 	if (event->header.type != PERF_RECORD_SAMPLE) {
 		if (!evsel->core.attr.sample_id_all)
@@ -2708,6 +2730,32 @@ int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event,
 	return 0;
 }
 
+u16 evsel__id_hdr_size(struct evsel *evsel)
+{
+	u64 sample_type = evsel->core.attr.sample_type;
+	u16 size = 0;
+
+	if (sample_type & PERF_SAMPLE_TID)
+		size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		size += sizeof(u64);
+
+	if (sample_type & PERF_SAMPLE_IDENTIFIER)
+		size += sizeof(u64);
+
+	return size;
+}
+
 struct tep_format_field *evsel__field(struct evsel *evsel, const char *name)
 {
 	return tep_find_field(evsel->tp_format, name);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 73ea48e94079..d927713b513e 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -271,6 +271,7 @@ extern const char *const evsel__hw_names[PERF_COUNT_HW_MAX];
 extern const char *const evsel__sw_names[PERF_COUNT_SW_MAX];
 extern char *evsel__bpf_counter_events;
 bool evsel__match_bpf_counter_events(const char *name);
+int arch_evsel__hw_name(struct evsel *evsel, char *bf, size_t size);
 
 int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size);
 const char *evsel__name(struct evsel *evsel);
@@ -297,6 +298,7 @@ void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier);
 
 void arch_evsel__set_sample_weight(struct evsel *evsel);
 void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr);
+void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr);
 
 int evsel__set_filter(struct evsel *evsel, const char *filter);
 int evsel__append_tp_filter(struct evsel *evsel, const char *filter);
@@ -380,6 +382,8 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event,
 				  u64 *timestamp);
 
+u16 evsel__id_hdr_size(struct evsel *evsel);
+
 static inline struct evsel *evsel__next(struct evsel *evsel)
 {
 	return list_entry(evsel->core.node.next, struct evsel, core.node);
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 675f318ce7c1..c15a9852fa41 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -12,6 +12,7 @@
 #include "expr-bison.h"
 #include "expr-flex.h"
 #include "smt.h"
+#include "tsc.h"
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
@@ -402,6 +403,13 @@ double expr_id_data__source_count(const struct expr_id_data *data)
 	return data->val.source_count;
 }
 
+#if !defined(__i386__) && !defined(__x86_64__)
+double arch_get_tsc_freq(void)
+{
+	return 0.0;
+}
+#endif
+
 double expr__get_literal(const char *literal)
 {
 	static struct cpu_topology *topology;
@@ -417,6 +425,11 @@ double expr__get_literal(const char *literal)
 		goto out;
 	}
 
+	if (!strcasecmp("#system_tsc_freq", literal)) {
+		result = arch_get_tsc_freq();
+		goto out;
+	}
+
 	/*
 	 * Assume that topology strings are consistent, such as CPUs "0-1"
 	 * wouldn't be listed as "0,1", and so after deduplication the number of
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
index 0a13eb20c814..4dc8edbfd9ce 100644
--- a/tools/perf/util/expr.l
+++ b/tools/perf/util/expr.l
@@ -91,7 +91,7 @@ static int literal(yyscan_t scanner)
 }
 %}
 
-number		([0-9]+\.?[0-9]*|[0-9]*\.?[0-9]+)
+number		([0-9]+\.?[0-9]*|[0-9]*\.?[0-9]+)(e-?[0-9]+)?
 
 sch		[-,=]
 spec		\\{sch}
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
index aed49806a09b..953338b9e887 100644
--- a/tools/perf/util/genelf.c
+++ b/tools/perf/util/genelf.c
@@ -30,7 +30,11 @@
 
 #define BUILD_ID_URANDOM /* different uuid for each run */
 
-#ifdef HAVE_LIBCRYPTO
+// FIXME, remove this and fix the deprecation warnings before its removed and
+// We'll break for good here...
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#ifdef HAVE_LIBCRYPTO_SUPPORT
 
 #define BUILD_ID_MD5
 #undef BUILD_ID_SHA	/* does not seem to work well when linked with Java */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 53332da100e8..c30c29c51410 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1512,18 +1512,13 @@ static int write_compressed(struct feat_fd *ff __maybe_unused,
 	return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len));
 }
 
-static int write_per_cpu_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu,
-				  bool write_pmu)
+static int __write_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu,
+			    bool write_pmu)
 {
 	struct perf_pmu_caps *caps = NULL;
-	int nr_caps;
 	int ret;
 
-	nr_caps = perf_pmu__caps_parse(pmu);
-	if (nr_caps < 0)
-		return nr_caps;
-
-	ret = do_write(ff, &nr_caps, sizeof(nr_caps));
+	ret = do_write(ff, &pmu->nr_caps, sizeof(pmu->nr_caps));
 	if (ret < 0)
 		return ret;
 
@@ -1550,33 +1545,60 @@ static int write_cpu_pmu_caps(struct feat_fd *ff,
 			      struct evlist *evlist __maybe_unused)
 {
 	struct perf_pmu *cpu_pmu = perf_pmu__find("cpu");
+	int ret;
 
 	if (!cpu_pmu)
 		return -ENOENT;
 
-	return write_per_cpu_pmu_caps(ff, cpu_pmu, false);
+	ret = perf_pmu__caps_parse(cpu_pmu);
+	if (ret < 0)
+		return ret;
+
+	return __write_pmu_caps(ff, cpu_pmu, false);
 }
 
-static int write_hybrid_cpu_pmu_caps(struct feat_fd *ff,
-				     struct evlist *evlist __maybe_unused)
+static int write_pmu_caps(struct feat_fd *ff,
+			  struct evlist *evlist __maybe_unused)
 {
-	struct perf_pmu *pmu;
-	u32 nr_pmu = perf_pmu__hybrid_pmu_num();
+	struct perf_pmu *pmu = NULL;
+	int nr_pmu = 0;
 	int ret;
 
-	if (nr_pmu == 0)
-		return -ENOENT;
+	while ((pmu = perf_pmu__scan(pmu))) {
+		if (!pmu->name || !strcmp(pmu->name, "cpu") ||
+		    perf_pmu__caps_parse(pmu) <= 0)
+			continue;
+		nr_pmu++;
+	}
 
 	ret = do_write(ff, &nr_pmu, sizeof(nr_pmu));
 	if (ret < 0)
 		return ret;
 
+	if (!nr_pmu)
+		return 0;
+
+	/*
+	 * Write hybrid pmu caps first to maintain compatibility with
+	 * older perf tool.
+	 */
+	pmu = NULL;
 	perf_pmu__for_each_hybrid_pmu(pmu) {
-		ret = write_per_cpu_pmu_caps(ff, pmu, true);
+		ret = __write_pmu_caps(ff, pmu, true);
 		if (ret < 0)
 			return ret;
 	}
 
+	pmu = NULL;
+	while ((pmu = perf_pmu__scan(pmu))) {
+		if (!pmu->name || !strcmp(pmu->name, "cpu") ||
+		    !pmu->nr_caps || perf_pmu__is_hybrid(pmu->name))
+			continue;
+
+		ret = __write_pmu_caps(ff, pmu, true);
+		if (ret < 0)
+			return ret;
+	}
 	return 0;
 }
 
@@ -2051,32 +2073,20 @@ static void print_compressed(struct feat_fd *ff, FILE *fp)
 		ff->ph->env.comp_level, ff->ph->env.comp_ratio);
 }
 
-static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps,
-				   char *pmu_name)
+static void __print_pmu_caps(FILE *fp, int nr_caps, char **caps, char *pmu_name)
 {
-	const char *delimiter;
-	char *str, buf[128];
+	const char *delimiter = "";
+	int i;
 
 	if (!nr_caps) {
-		if (!pmu_name)
-			fprintf(fp, "# cpu pmu capabilities: not available\n");
-		else
-			fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name);
+		fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name);
 		return;
 	}
 
-	if (!pmu_name)
-		scnprintf(buf, sizeof(buf), "# cpu pmu capabilities: ");
-	else
-		scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name);
-
-	delimiter = buf;
-
-	str = cpu_pmu_caps;
-	while (nr_caps--) {
-		fprintf(fp, "%s%s", delimiter, str);
+	fprintf(fp, "# %s pmu capabilities: ", pmu_name);
+	for (i = 0; i < nr_caps; i++) {
+		fprintf(fp, "%s%s", delimiter, caps[i]);
 		delimiter = ", ";
-		str += strlen(str) + 1;
 	}
 
 	fprintf(fp, "\n");
@@ -2084,19 +2094,18 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps,
 
 static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp)
 {
-	print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps,
-			       ff->ph->env.cpu_pmu_caps, NULL);
+	__print_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps,
+			 ff->ph->env.cpu_pmu_caps, (char *)"cpu");
 }
 
-static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp)
+static void print_pmu_caps(struct feat_fd *ff, FILE *fp)
 {
-	struct hybrid_cpc_node *n;
+	struct pmu_caps *pmu_caps;
 
-	for (int i = 0; i < ff->ph->env.nr_hybrid_cpc_nodes; i++) {
-		n = &ff->ph->env.hybrid_cpc_nodes[i];
-		print_per_cpu_pmu_caps(fp, n->nr_cpu_pmu_caps,
-				       n->cpu_pmu_caps,
-				       n->pmu_name);
+	for (int i = 0; i < ff->ph->env.nr_pmus_with_caps; i++) {
+		pmu_caps = &ff->ph->env.pmu_caps[i];
+		__print_pmu_caps(fp, pmu_caps->nr_caps, pmu_caps->caps,
+				 pmu_caps->pmu_name);
 	}
 }
 
@@ -3207,28 +3216,26 @@ static int process_compressed(struct feat_fd *ff,
 	return 0;
 }
 
-static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps,
-				    char **cpu_pmu_caps,
-				    unsigned int *max_branches)
+static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps,
+			      char ***caps, unsigned int *max_branches)
 {
-	char *name, *value;
-	struct strbuf sb;
-	u32 nr_caps;
+	char *name, *value, *ptr;
+	u32 nr_pmu_caps, i;
 
-	if (do_read_u32(ff, &nr_caps))
+	*nr_caps = 0;
+	*caps = NULL;
+
+	if (do_read_u32(ff, &nr_pmu_caps))
 		return -1;
 
-	if (!nr_caps) {
-		pr_debug("cpu pmu capabilities not available\n");
+	if (!nr_pmu_caps)
 		return 0;
-	}
 
-	*nr_cpu_pmu_caps = nr_caps;
-
-	if (strbuf_init(&sb, 128) < 0)
+	*caps = zalloc(sizeof(char *) * nr_pmu_caps);
+	if (!*caps)
 		return -1;
 
-	while (nr_caps--) {
+	for (i = 0; i < nr_pmu_caps; i++) {
 		name = do_read_string(ff);
 		if (!name)
 			goto error;
@@ -3237,12 +3244,10 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps,
 		if (!value)
 			goto free_name;
 
-		if (strbuf_addf(&sb, "%s=%s", name, value) < 0)
+		if (asprintf(&ptr, "%s=%s", name, value) < 0)
 			goto free_value;
 
-		/* include a NULL character at the end */
-		if (strbuf_add(&sb, "", 1) < 0)
-			goto free_value;
+		(*caps)[i] = ptr;
 
 		if (!strcmp(name, "branches"))
 			*max_branches = atoi(value);
@@ -3250,7 +3255,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps,
 		free(value);
 		free(name);
 	}
-	*cpu_pmu_caps = strbuf_detach(&sb, NULL);
+	*nr_caps = nr_pmu_caps;
 	return 0;
 
 free_value:
@@ -3258,64 +3263,76 @@ free_value:
 free_name:
 	free(name);
 error:
-	strbuf_release(&sb);
+	for (; i > 0; i--)
+		free((*caps)[i - 1]);
+	free(*caps);
+	*caps = NULL;
+	*nr_caps = 0;
 	return -1;
 }
 
 static int process_cpu_pmu_caps(struct feat_fd *ff,
 				void *data __maybe_unused)
 {
-	return process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps,
-					&ff->ph->env.cpu_pmu_caps,
-					&ff->ph->env.max_branches);
+	int ret = __process_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps,
+				     &ff->ph->env.cpu_pmu_caps,
+				     &ff->ph->env.max_branches);
+
+	if (!ret && !ff->ph->env.cpu_pmu_caps)
+		pr_debug("cpu pmu capabilities not available\n");
+	return ret;
 }
 
-static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff,
-				       void *data __maybe_unused)
+static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused)
 {
-	struct hybrid_cpc_node *nodes;
+	struct pmu_caps *pmu_caps;
 	u32 nr_pmu, i;
 	int ret;
+	int j;
 
 	if (do_read_u32(ff, &nr_pmu))
 		return -1;
 
 	if (!nr_pmu) {
-		pr_debug("hybrid cpu pmu capabilities not available\n");
+		pr_debug("pmu capabilities not available\n");
 		return 0;
 	}
 
-	nodes = zalloc(sizeof(*nodes) * nr_pmu);
-	if (!nodes)
+	pmu_caps = zalloc(sizeof(*pmu_caps) * nr_pmu);
+	if (!pmu_caps)
 		return -ENOMEM;
 
 	for (i = 0; i < nr_pmu; i++) {
-		struct hybrid_cpc_node *n = &nodes[i];
-
-		ret = process_per_cpu_pmu_caps(ff, &n->nr_cpu_pmu_caps,
-					       &n->cpu_pmu_caps,
-					       &n->max_branches);
+		ret = __process_pmu_caps(ff, &pmu_caps[i].nr_caps,
+					 &pmu_caps[i].caps,
+					 &pmu_caps[i].max_branches);
 		if (ret)
 			goto err;
 
-		n->pmu_name = do_read_string(ff);
-		if (!n->pmu_name) {
+		pmu_caps[i].pmu_name = do_read_string(ff);
+		if (!pmu_caps[i].pmu_name) {
 			ret = -1;
 			goto err;
 		}
+		if (!pmu_caps[i].nr_caps) {
+			pr_debug("%s pmu capabilities not available\n",
+				 pmu_caps[i].pmu_name);
+		}
 	}
 
-	ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu;
-	ff->ph->env.hybrid_cpc_nodes = nodes;
+	ff->ph->env.nr_pmus_with_caps = nr_pmu;
+	ff->ph->env.pmu_caps = pmu_caps;
 	return 0;
 
 err:
 	for (i = 0; i < nr_pmu; i++) {
-		free(nodes[i].cpu_pmu_caps);
-		free(nodes[i].pmu_name);
+		for (j = 0; j < pmu_caps[i].nr_caps; j++)
+			free(pmu_caps[i].caps[j]);
+		free(pmu_caps[i].caps);
+		free(pmu_caps[i].pmu_name);
 	}
 
-	free(nodes);
+	free(pmu_caps);
 	return ret;
 }
 
@@ -3381,7 +3398,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(CPU_PMU_CAPS,	cpu_pmu_caps,	false),
 	FEAT_OPR(CLOCK_DATA,	clock_data,	false),
 	FEAT_OPN(HYBRID_TOPOLOGY,	hybrid_topology,	true),
-	FEAT_OPR(HYBRID_CPU_PMU_CAPS,	hybrid_cpu_pmu_caps,	false),
+	FEAT_OPR(PMU_CAPS,	pmu_caps,	false),
 };
 
 struct header_print_data {
@@ -3686,6 +3703,20 @@ int perf_session__write_header(struct perf_session *session,
 	return perf_session__do_write_header(session, evlist, fd, at_exit, NULL);
 }
 
+size_t perf_session__data_offset(const struct evlist *evlist)
+{
+	struct evsel *evsel;
+	size_t data_offset;
+
+	data_offset = sizeof(struct perf_file_header);
+	evlist__for_each_entry(evlist, evsel) {
+		data_offset += evsel->core.ids * sizeof(u64);
+	}
+	data_offset += evlist->core.nr_entries * sizeof(struct perf_file_attr);
+
+	return data_offset;
+}
+
 int perf_session__inject_header(struct perf_session *session,
 				struct evlist *evlist,
 				int fd,
@@ -4349,6 +4380,9 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused,
 	struct evsel *evsel;
 	struct perf_cpu_map *map;
 
+	if (dump_trace)
+		perf_event__fprintf_event_update(event, stdout);
+
 	if (!pevlist || *pevlist == NULL)
 		return -EINVAL;
 
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 08563c1f1bff..2d5e601ba60f 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -46,7 +46,7 @@ enum {
 	HEADER_CPU_PMU_CAPS,
 	HEADER_CLOCK_DATA,
 	HEADER_HYBRID_TOPOLOGY,
-	HEADER_HYBRID_CPU_PMU_CAPS,
+	HEADER_PMU_CAPS,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
@@ -136,6 +136,8 @@ int perf_session__inject_header(struct perf_session *session,
 				int fd,
 				struct feat_copier *fc);
 
+size_t perf_session__data_offset(const struct evlist *evlist);
+
 void perf_header__set_feat(struct perf_header *header, int feat);
 void perf_header__clear_feat(struct perf_header *header, int feat);
 bool perf_header__has_feat(const struct perf_header *header, int feat);
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 62b2f375a94d..d5e9fc8106dd 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -74,10 +74,12 @@ struct intel_pt {
 	bool data_queued;
 	bool est_tsc;
 	bool sync_switch;
+	bool sync_switch_not_supported;
 	bool mispred_all;
 	bool use_thread_stack;
 	bool callstack;
 	bool cap_event_trace;
+	bool have_guest_sideband;
 	unsigned int br_stack_sz;
 	unsigned int br_stack_sz_plus;
 	int have_sched_switch;
@@ -195,6 +197,9 @@ struct intel_pt_queue {
 	struct thread *guest_thread;
 	struct thread *unknown_guest_thread;
 	pid_t guest_machine_pid;
+	pid_t guest_pid;
+	pid_t guest_tid;
+	int vcpu;
 	bool exclude_kernel;
 	bool have_sample;
 	u64 time;
@@ -685,7 +690,7 @@ static int intel_pt_get_guest(struct intel_pt_queue *ptq)
 	struct machine *machine;
 	pid_t pid = ptq->pid <= 0 ? DEFAULT_GUEST_KERNEL_ID : ptq->pid;
 
-	if (ptq->guest_machine && pid == ptq->guest_machine_pid)
+	if (ptq->guest_machine && pid == ptq->guest_machine->pid)
 		return 0;
 
 	ptq->guest_machine = NULL;
@@ -705,7 +710,6 @@ static int intel_pt_get_guest(struct intel_pt_queue *ptq)
 		return -1;
 
 	ptq->guest_machine = machine;
-	ptq->guest_machine_pid = pid;
 
 	return 0;
 }
@@ -759,28 +763,44 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn,
 	cpumode = intel_pt_nr_cpumode(ptq, *ip, nr);
 
 	if (nr) {
-		if ((!symbol_conf.guest_code && cpumode != PERF_RECORD_MISC_GUEST_KERNEL) ||
-		    intel_pt_get_guest(ptq))
+		if (ptq->pt->have_guest_sideband) {
+			if (!ptq->guest_machine || ptq->guest_machine_pid != ptq->pid) {
+				intel_pt_log("ERROR: guest sideband but no guest machine\n");
+				return -EINVAL;
+			}
+		} else if ((!symbol_conf.guest_code && cpumode != PERF_RECORD_MISC_GUEST_KERNEL) ||
+			   intel_pt_get_guest(ptq)) {
+			intel_pt_log("ERROR: no guest machine\n");
 			return -EINVAL;
+		}
 		machine = ptq->guest_machine;
 		thread = ptq->guest_thread;
 		if (!thread) {
-			if (cpumode != PERF_RECORD_MISC_GUEST_KERNEL)
+			if (cpumode != PERF_RECORD_MISC_GUEST_KERNEL) {
+				intel_pt_log("ERROR: no guest thread\n");
 				return -EINVAL;
+			}
 			thread = ptq->unknown_guest_thread;
 		}
 	} else {
 		thread = ptq->thread;
 		if (!thread) {
-			if (cpumode != PERF_RECORD_MISC_KERNEL)
+			if (cpumode != PERF_RECORD_MISC_KERNEL) {
+				intel_pt_log("ERROR: no thread\n");
 				return -EINVAL;
+			}
 			thread = ptq->pt->unknown_thread;
 		}
 	}
 
 	while (1) {
-		if (!thread__find_map(thread, cpumode, *ip, &al) || !al.map->dso)
+		if (!thread__find_map(thread, cpumode, *ip, &al) || !al.map->dso) {
+			if (al.map)
+				intel_pt_log("ERROR: thread has no dso for %#" PRIx64 "\n", *ip);
+			else
+				intel_pt_log("ERROR: thread has no map for %#" PRIx64 "\n", *ip);
 			return -EINVAL;
+		}
 
 		if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
 		    dso__data_status_seen(al.map->dso,
@@ -821,8 +841,12 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn,
 			len = dso__data_read_offset(al.map->dso, machine,
 						    offset, buf,
 						    INTEL_PT_INSN_BUF_SZ);
-			if (len <= 0)
+			if (len <= 0) {
+				intel_pt_log("ERROR: failed to read at %" PRIu64 " ", offset);
+				if (intel_pt_enable_logging)
+					dso__fprintf(al.map->dso, intel_pt_log_fp());
 				return -EINVAL;
+			}
 
 			if (intel_pt_get_insn(buf, len, x86_64, intel_pt_insn))
 				return -EINVAL;
@@ -1370,6 +1394,55 @@ static void intel_pt_first_timestamp(struct intel_pt *pt, u64 timestamp)
 	}
 }
 
+static int intel_pt_get_guest_from_sideband(struct intel_pt_queue *ptq)
+{
+	struct machines *machines = &ptq->pt->session->machines;
+	struct machine *machine;
+	pid_t machine_pid = ptq->pid;
+	pid_t tid;
+	int vcpu;
+
+	if (machine_pid <= 0)
+		return 0; /* Not a guest machine */
+
+	machine = machines__find(machines, machine_pid);
+	if (!machine)
+		return 0; /* Not a guest machine */
+
+	if (ptq->guest_machine != machine) {
+		ptq->guest_machine = NULL;
+		thread__zput(ptq->guest_thread);
+		thread__zput(ptq->unknown_guest_thread);
+
+		ptq->unknown_guest_thread = machine__find_thread(machine, 0, 0);
+		if (!ptq->unknown_guest_thread)
+			return -1;
+		ptq->guest_machine = machine;
+	}
+
+	vcpu = ptq->thread ? ptq->thread->guest_cpu : -1;
+	if (vcpu < 0)
+		return -1;
+
+	tid = machine__get_current_tid(machine, vcpu);
+
+	if (ptq->guest_thread && ptq->guest_thread->tid != tid)
+		thread__zput(ptq->guest_thread);
+
+	if (!ptq->guest_thread) {
+		ptq->guest_thread = machine__find_thread(machine, -1, tid);
+		if (!ptq->guest_thread)
+			return -1;
+	}
+
+	ptq->guest_machine_pid = machine_pid;
+	ptq->guest_pid = ptq->guest_thread->pid_;
+	ptq->guest_tid = tid;
+	ptq->vcpu = vcpu;
+
+	return 0;
+}
+
 static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt,
 				     struct auxtrace_queue *queue)
 {
@@ -1390,6 +1463,13 @@ static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt,
 		if (queue->cpu == -1)
 			ptq->cpu = ptq->thread->cpu;
 	}
+
+	if (pt->have_guest_sideband && intel_pt_get_guest_from_sideband(ptq)) {
+		ptq->guest_machine_pid = 0;
+		ptq->guest_pid = -1;
+		ptq->guest_tid = -1;
+		ptq->vcpu = -1;
+	}
 }
 
 static void intel_pt_sample_flags(struct intel_pt_queue *ptq)
@@ -1577,6 +1657,17 @@ static void intel_pt_prep_a_sample(struct intel_pt_queue *ptq,
 
 	sample->pid = ptq->pid;
 	sample->tid = ptq->tid;
+
+	if (ptq->pt->have_guest_sideband) {
+		if ((ptq->state->from_ip && ptq->state->from_nr) ||
+		    (ptq->state->to_ip && ptq->state->to_nr)) {
+			sample->pid = ptq->guest_pid;
+			sample->tid = ptq->guest_tid;
+			sample->machine_pid = ptq->guest_machine_pid;
+			sample->vcpu = ptq->vcpu;
+		}
+	}
+
 	sample->cpu = ptq->cpu;
 	sample->insn_len = ptq->insn_len;
 	memcpy(sample->insn, ptq->insn, INTEL_PT_INSN_BUF_SZ);
@@ -2324,7 +2415,8 @@ static int intel_pt_synth_iflag_chg_sample(struct intel_pt_queue *ptq)
 }
 
 static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu,
-				pid_t pid, pid_t tid, u64 ip, u64 timestamp)
+				pid_t pid, pid_t tid, u64 ip, u64 timestamp,
+				pid_t machine_pid, int vcpu)
 {
 	union perf_event event;
 	char msg[MAX_AUXTRACE_ERROR_MSG];
@@ -2341,8 +2433,9 @@ static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu,
 
 	intel_pt__strerror(code, msg, MAX_AUXTRACE_ERROR_MSG);
 
-	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
-			     code, cpu, pid, tid, ip, msg, timestamp);
+	auxtrace_synth_guest_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
+				   code, cpu, pid, tid, ip, msg, timestamp,
+				   machine_pid, vcpu);
 
 	err = perf_session__deliver_synth_event(pt->session, &event, NULL);
 	if (err)
@@ -2357,11 +2450,22 @@ static int intel_ptq_synth_error(struct intel_pt_queue *ptq,
 {
 	struct intel_pt *pt = ptq->pt;
 	u64 tm = ptq->timestamp;
+	pid_t machine_pid = 0;
+	pid_t pid = ptq->pid;
+	pid_t tid = ptq->tid;
+	int vcpu = -1;
 
 	tm = pt->timeless_decoding ? 0 : tsc_to_perf_time(tm, &pt->tc);
 
-	return intel_pt_synth_error(pt, state->err, ptq->cpu, ptq->pid,
-				    ptq->tid, state->from_ip, tm);
+	if (pt->have_guest_sideband && state->from_nr) {
+		machine_pid = ptq->guest_machine_pid;
+		vcpu = ptq->vcpu;
+		pid = ptq->guest_pid;
+		tid = ptq->guest_tid;
+	}
+
+	return intel_pt_synth_error(pt, state->err, ptq->cpu, pid, tid,
+				    state->from_ip, tm, machine_pid, vcpu);
 }
 
 static int intel_pt_next_tid(struct intel_pt *pt, struct intel_pt_queue *ptq)
@@ -2624,6 +2728,9 @@ static void intel_pt_enable_sync_switch(struct intel_pt *pt)
 {
 	unsigned int i;
 
+	if (pt->sync_switch_not_supported)
+		return;
+
 	pt->sync_switch = true;
 
 	for (i = 0; i < pt->queues.nr_queues; i++) {
@@ -2635,6 +2742,23 @@ static void intel_pt_enable_sync_switch(struct intel_pt *pt)
 	}
 }
 
+static void intel_pt_disable_sync_switch(struct intel_pt *pt)
+{
+	unsigned int i;
+
+	pt->sync_switch = false;
+
+	for (i = 0; i < pt->queues.nr_queues; i++) {
+		struct auxtrace_queue *queue = &pt->queues.queue_array[i];
+		struct intel_pt_queue *ptq = queue->priv;
+
+		if (ptq) {
+			ptq->sync_switch = false;
+			intel_pt_next_tid(pt, ptq);
+		}
+	}
+}
+
 /*
  * To filter against time ranges, it is only necessary to look at the next start
  * or end time.
@@ -2928,7 +3052,8 @@ static int intel_pt_process_timeless_sample(struct intel_pt *pt,
 static int intel_pt_lost(struct intel_pt *pt, struct perf_sample *sample)
 {
 	return intel_pt_synth_error(pt, INTEL_PT_ERR_LOST, sample->cpu,
-				    sample->pid, sample->tid, 0, sample->time);
+				    sample->pid, sample->tid, 0, sample->time,
+				    sample->machine_pid, sample->vcpu);
 }
 
 static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
@@ -3066,6 +3191,33 @@ static int intel_pt_context_switch_in(struct intel_pt *pt,
 	return machine__set_current_tid(pt->machine, cpu, pid, tid);
 }
 
+static int intel_pt_guest_context_switch(struct intel_pt *pt,
+					 union perf_event *event,
+					 struct perf_sample *sample)
+{
+	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
+	struct machines *machines = &pt->session->machines;
+	struct machine *machine = machines__find(machines, sample->machine_pid);
+
+	pt->have_guest_sideband = true;
+
+	/*
+	 * sync_switch cannot handle guest machines at present, so just disable
+	 * it.
+	 */
+	pt->sync_switch_not_supported = true;
+	if (pt->sync_switch)
+		intel_pt_disable_sync_switch(pt);
+
+	if (out)
+		return 0;
+
+	if (!machine)
+		return -EINVAL;
+
+	return machine__set_current_tid(machine, sample->vcpu, sample->pid, sample->tid);
+}
+
 static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
 				   struct perf_sample *sample)
 {
@@ -3073,6 +3225,9 @@ static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
 	pid_t pid, tid;
 	int cpu, ret;
 
+	if (perf_event__is_guest(event))
+		return intel_pt_guest_context_switch(pt, event, sample);
+
 	cpu = sample->cpu;
 
 	if (pt->have_sched_switch == 3) {
diff --git a/tools/perf/util/kwork.h b/tools/perf/util/kwork.h
new file mode 100644
index 000000000000..320c0a6d2e08
--- /dev/null
+++ b/tools/perf/util/kwork.h
@@ -0,0 +1,257 @@
+#ifndef PERF_UTIL_KWORK_H
+#define PERF_UTIL_KWORK_H
+
+#include "perf.h"
+
+#include "util/tool.h"
+#include "util/event.h"
+#include "util/evlist.h"
+#include "util/session.h"
+#include "util/time-utils.h"
+
+#include <linux/list.h>
+#include <linux/bitmap.h>
+
+enum kwork_class_type {
+	KWORK_CLASS_IRQ,
+	KWORK_CLASS_SOFTIRQ,
+	KWORK_CLASS_WORKQUEUE,
+	KWORK_CLASS_MAX,
+};
+
+enum kwork_report_type {
+	KWORK_REPORT_RUNTIME,
+	KWORK_REPORT_LATENCY,
+	KWORK_REPORT_TIMEHIST,
+};
+
+enum kwork_trace_type {
+	KWORK_TRACE_RAISE,
+	KWORK_TRACE_ENTRY,
+	KWORK_TRACE_EXIT,
+	KWORK_TRACE_MAX,
+};
+
+/*
+ * data structure:
+ *
+ *                 +==================+ +============+ +======================+
+ *                 |      class       | |    work    | |         atom         |
+ *                 +==================+ +============+ +======================+
+ * +------------+  |  +-----+         | |  +------+  | |  +-------+   +-----+ |
+ * | perf_kwork | +-> | irq | --------|+-> | eth0 | --+-> | raise | - | ... | --+   +-----------+
+ * +-----+------+ ||  +-----+         |||  +------+  |||  +-------+   +-----+ | |   |           |
+ *       |        ||                  |||            |||                      | +-> | atom_page |
+ *       |        ||                  |||            |||  +-------+   +-----+ |     |           |
+ *       |  class_list                |||            |+-> | entry | - | ... | ----> |           |
+ *       |        ||                  |||            |||  +-------+   +-----+ |     |           |
+ *       |        ||                  |||            |||                      | +-> |           |
+ *       |        ||                  |||            |||  +-------+   +-----+ | |   |           |
+ *       |        ||                  |||            |+-> | exit  | - | ... | --+   +-----+-----+
+ *       |        ||                  |||            | |  +-------+   +-----+ |           |
+ *       |        ||                  |||            | |                      |           |
+ *       |        ||                  |||  +-----+   | |                      |           |
+ *       |        ||                  |+-> | ... |   | |                      |           |
+ *       |        ||                  | |  +-----+   | |                      |           |
+ *       |        ||                  | |            | |                      |           |
+ *       |        ||  +---------+     | |  +-----+   | |  +-------+   +-----+ |           |
+ *       |        +-> | softirq | -------> | RCU | ---+-> | raise | - | ... | --+   +-----+-----+
+ *       |        ||  +---------+     | |  +-----+   |||  +-------+   +-----+ | |   |           |
+ *       |        ||                  | |            |||                      | +-> | atom_page |
+ *       |        ||                  | |            |||  +-------+   +-----+ |     |           |
+ *       |        ||                  | |            |+-> | entry | - | ... | ----> |           |
+ *       |        ||                  | |            |||  +-------+   +-----+ |     |           |
+ *       |        ||                  | |            |||                      | +-> |           |
+ *       |        ||                  | |            |||  +-------+   +-----+ | |   |           |
+ *       |        ||                  | |            |+-> | exit  | - | ... | --+   +-----+-----+
+ *       |        ||                  | |            | |  +-------+   +-----+ |           |
+ *       |        ||                  | |            | |                      |           |
+ *       |        ||  +-----------+   | |  +-----+   | |                      |           |
+ *       |        +-> | workqueue | -----> | ... |   | |                      |           |
+ *       |         |  +-----------+   | |  +-----+   | |                      |           |
+ *       |         +==================+ +============+ +======================+           |
+ *       |                                                                                |
+ *       +---->  atom_page_list  ---------------------------------------------------------+
+ *
+ */
+
+struct kwork_atom {
+	struct list_head list;
+	u64 time;
+	struct kwork_atom *prev;
+
+	void *page_addr;
+	unsigned long bit_inpage;
+};
+
+#define NR_ATOM_PER_PAGE 128
+struct kwork_atom_page {
+	struct list_head list;
+	struct kwork_atom atoms[NR_ATOM_PER_PAGE];
+	DECLARE_BITMAP(bitmap, NR_ATOM_PER_PAGE);
+};
+
+struct kwork_class;
+struct kwork_work {
+	/*
+	 * class field
+	 */
+	struct rb_node node;
+	struct kwork_class *class;
+
+	/*
+	 * work field
+	 */
+	u64 id;
+	int cpu;
+	char *name;
+
+	/*
+	 * atom field
+	 */
+	u64 nr_atoms;
+	struct list_head atom_list[KWORK_TRACE_MAX];
+
+	/*
+	 * runtime report
+	 */
+	u64 max_runtime;
+	u64 max_runtime_start;
+	u64 max_runtime_end;
+	u64 total_runtime;
+
+	/*
+	 * latency report
+	 */
+	u64 max_latency;
+	u64 max_latency_start;
+	u64 max_latency_end;
+	u64 total_latency;
+};
+
+struct kwork_class {
+	struct list_head list;
+	const char *name;
+	enum kwork_class_type type;
+
+	unsigned int nr_tracepoints;
+	const struct evsel_str_handler *tp_handlers;
+
+	struct rb_root_cached work_root;
+
+	int (*class_init)(struct kwork_class *class,
+			  struct perf_session *session);
+
+	void (*work_init)(struct kwork_class *class,
+			  struct kwork_work *work,
+			  struct evsel *evsel,
+			  struct perf_sample *sample,
+			  struct machine *machine);
+
+	void (*work_name)(struct kwork_work *work,
+			  char *buf, int len);
+};
+
+struct perf_kwork;
+struct trace_kwork_handler {
+	int (*raise_event)(struct perf_kwork *kwork,
+			   struct kwork_class *class, struct evsel *evsel,
+			   struct perf_sample *sample, struct machine *machine);
+
+	int (*entry_event)(struct perf_kwork *kwork,
+			   struct kwork_class *class, struct evsel *evsel,
+			   struct perf_sample *sample, struct machine *machine);
+
+	int (*exit_event)(struct perf_kwork *kwork,
+			  struct kwork_class *class, struct evsel *evsel,
+			  struct perf_sample *sample, struct machine *machine);
+};
+
+struct perf_kwork {
+	/*
+	 * metadata
+	 */
+	struct perf_tool tool;
+	struct list_head class_list;
+	struct list_head atom_page_list;
+	struct list_head sort_list, cmp_id;
+	struct rb_root_cached sorted_work_root;
+	const struct trace_kwork_handler *tp_handler;
+
+	/*
+	 * profile filters
+	 */
+	const char *profile_name;
+
+	const char *cpu_list;
+	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+
+	const char *time_str;
+	struct perf_time_interval ptime;
+
+	/*
+	 * options for command
+	 */
+	bool force;
+	const char *event_list_str;
+	enum kwork_report_type report;
+
+	/*
+	 * options for subcommand
+	 */
+	bool summary;
+	const char *sort_order;
+	bool show_callchain;
+	unsigned int max_stack;
+	bool use_bpf;
+
+	/*
+	 * statistics
+	 */
+	u64 timestart;
+	u64 timeend;
+
+	unsigned long nr_events;
+	unsigned long nr_lost_chunks;
+	unsigned long nr_lost_events;
+
+	u64 all_runtime;
+	u64 all_count;
+	u64 nr_skipped_events[KWORK_TRACE_MAX + 1];
+};
+
+struct kwork_work *perf_kwork_add_work(struct perf_kwork *kwork,
+				       struct kwork_class *class,
+				       struct kwork_work *key);
+
+#ifdef HAVE_BPF_SKEL
+
+int perf_kwork__trace_prepare_bpf(struct perf_kwork *kwork);
+int perf_kwork__report_read_bpf(struct perf_kwork *kwork);
+void perf_kwork__report_cleanup_bpf(void);
+
+void perf_kwork__trace_start(void);
+void perf_kwork__trace_finish(void);
+
+#else  /* !HAVE_BPF_SKEL */
+
+static inline int
+perf_kwork__trace_prepare_bpf(struct perf_kwork *kwork __maybe_unused)
+{
+	return -1;
+}
+
+static inline int
+perf_kwork__report_read_bpf(struct perf_kwork *kwork __maybe_unused)
+{
+	return -1;
+}
+
+static inline void perf_kwork__report_cleanup_bpf(void) {}
+
+static inline void perf_kwork__trace_start(void) {}
+static inline void perf_kwork__trace_finish(void) {}
+
+#endif  /* HAVE_BPF_SKEL */
+
+#endif  /* PERF_UTIL_KWORK_H */
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index 96c8ef60f4f8..2dc797007419 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -25,7 +25,7 @@
 		"$CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS " \
 		"-Wno-unused-value -Wno-pointer-sign "		\
 		"-working-directory $WORKING_DIR "		\
-		"-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE"
+		"-c \"$CLANG_SOURCE\" -target bpf $CLANG_EMIT_LLVM -g -O2 -o - $LLVM_OPTIONS_PIPE"
 
 struct llvm_param llvm_param = {
 	.clang_path = "clang",
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
new file mode 100644
index 000000000000..2146efc33396
--- /dev/null
+++ b/tools/perf/util/lock-contention.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef PERF_LOCK_CONTENTION_H
+#define PERF_LOCK_CONTENTION_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct lock_stat {
+	struct hlist_node	hash_entry;
+	struct rb_node		rb;		/* used for sorting */
+
+	u64			addr;		/* address of lockdep_map, used as ID */
+	char			*name;		/* for strcpy(), we cannot use const */
+
+	unsigned int		nr_acquire;
+	unsigned int		nr_acquired;
+	unsigned int		nr_contended;
+	unsigned int		nr_release;
+
+	union {
+		unsigned int	nr_readlock;
+		unsigned int	flags;
+	};
+	unsigned int		nr_trylock;
+
+	/* these times are in nano sec. */
+	u64                     avg_wait_time;
+	u64			wait_time_total;
+	u64			wait_time_min;
+	u64			wait_time_max;
+
+	int			broken; /* flag of blacklist */
+	int			combined;
+};
+
+/*
+ * States of lock_seq_stat
+ *
+ * UNINITIALIZED is required for detecting first event of acquire.
+ * As the nature of lock events, there is no guarantee
+ * that the first event for the locks are acquire,
+ * it can be acquired, contended or release.
+ */
+#define SEQ_STATE_UNINITIALIZED      0	       /* initial state */
+#define SEQ_STATE_RELEASED	1
+#define SEQ_STATE_ACQUIRING	2
+#define SEQ_STATE_ACQUIRED	3
+#define SEQ_STATE_READ_ACQUIRED	4
+#define SEQ_STATE_CONTENDED	5
+
+/*
+ * MAX_LOCK_DEPTH
+ * Imported from include/linux/sched.h.
+ * Should this be synchronized?
+ */
+#define MAX_LOCK_DEPTH 48
+
+/*
+ * struct lock_seq_stat:
+ * Place to put on state of one lock sequence
+ * 1) acquire -> acquired -> release
+ * 2) acquire -> contended -> acquired -> release
+ * 3) acquire (with read or try) -> release
+ * 4) Are there other patterns?
+ */
+struct lock_seq_stat {
+	struct list_head        list;
+	int			state;
+	u64			prev_event_time;
+	u64                     addr;
+
+	int                     read_count;
+};
+
+struct thread_stat {
+	struct rb_node		rb;
+
+	u32                     tid;
+	struct list_head        seq_list;
+};
+
+/*
+ * CONTENTION_STACK_DEPTH
+ * Number of stack trace entries to find callers
+ */
+#define CONTENTION_STACK_DEPTH  8
+
+/*
+ * CONTENTION_STACK_SKIP
+ * Number of stack trace entries to skip when finding callers.
+ * The first few entries belong to the locking implementation itself.
+ */
+#define CONTENTION_STACK_SKIP  3
+
+/*
+ * flags for lock:contention_begin
+ * Imported from include/trace/events/lock.h.
+ */
+#define LCB_F_SPIN	(1U << 0)
+#define LCB_F_READ	(1U << 1)
+#define LCB_F_WRITE	(1U << 2)
+#define LCB_F_RT	(1U << 3)
+#define LCB_F_PERCPU	(1U << 4)
+#define LCB_F_MUTEX	(1U << 5)
+
+struct evlist;
+struct machine;
+struct target;
+
+struct lock_contention {
+	struct evlist *evlist;
+	struct target *target;
+	struct machine *machine;
+	struct hlist_head *result;
+	unsigned long map_nr_entries;
+	unsigned long lost;
+};
+
+#ifdef HAVE_BPF_SKEL
+
+int lock_contention_prepare(struct lock_contention *con);
+int lock_contention_start(void);
+int lock_contention_stop(void);
+int lock_contention_read(struct lock_contention *con);
+int lock_contention_finish(void);
+
+#else  /* !HAVE_BPF_SKEL */
+
+static inline int lock_contention_prepare(struct lock_contention *con __maybe_unused)
+{
+	return 0;
+}
+
+static inline int lock_contention_start(void) { return 0; }
+static inline int lock_contention_stop(void) { return 0; }
+static inline int lock_contention_finish(void) { return 0; }
+
+static inline int lock_contention_read(struct lock_contention *con __maybe_unused)
+{
+	return 0;
+}
+
+#endif  /* HAVE_BPF_SKEL */
+
+bool is_lock_function(struct machine *machine, u64 addr);
+
+#endif  /* PERF_LOCK_CONTENTION_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 009061852808..facc13fbf16e 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1742,6 +1742,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 	struct map *map;
 	enum dso_space_type dso_space;
 	bool is_kernel_mmap;
+	const char *mmap_name = machine->mmap_name;
 
 	/* If we have maps from kcore then we do not need or want any others */
 	if (machine__uses_kcore(machine))
@@ -1752,8 +1753,16 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 	else
 		dso_space = DSO_SPACE__KERNEL_GUEST;
 
-	is_kernel_mmap = memcmp(xm->name, machine->mmap_name,
-				strlen(machine->mmap_name) - 1) == 0;
+	is_kernel_mmap = memcmp(xm->name, mmap_name, strlen(mmap_name) - 1) == 0;
+	if (!is_kernel_mmap && !machine__is_host(machine)) {
+		/*
+		 * If the event was recorded inside the guest and injected into
+		 * the host perf.data file, then it will match a host mmap_name,
+		 * so try that - see machine__set_mmap_name().
+		 */
+		mmap_name = "[kernel.kallsyms]";
+		is_kernel_mmap = memcmp(xm->name, mmap_name, strlen(mmap_name) - 1) == 0;
+	}
 	if (xm->name[0] == '/' ||
 	    (!is_kernel_mmap && xm->name[0] == '[')) {
 		map = machine__addnew_module_map(machine, xm->start,
@@ -1767,7 +1776,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 			dso__set_build_id(map->dso, bid);
 
 	} else if (is_kernel_mmap) {
-		const char *symbol_name = (xm->name + strlen(machine->mmap_name));
+		const char *symbol_name = xm->name + strlen(mmap_name);
 		/*
 		 * Should be there already, from the build-id table in
 		 * the header.
@@ -3174,9 +3183,7 @@ int machines__for_each_thread(struct machines *machines,
 
 pid_t machine__get_current_tid(struct machine *machine, int cpu)
 {
-	int nr_cpus = min(machine->env->nr_cpus_avail, MAX_NR_CPUS);
-
-	if (cpu < 0 || cpu >= nr_cpus || !machine->current_tid)
+	if (cpu < 0 || (size_t)cpu >= machine->current_tid_sz)
 		return -1;
 
 	return machine->current_tid[cpu];
@@ -3186,26 +3193,16 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
 			     pid_t tid)
 {
 	struct thread *thread;
-	int nr_cpus = min(machine->env->nr_cpus_avail, MAX_NR_CPUS);
+	const pid_t init_val = -1;
 
 	if (cpu < 0)
 		return -EINVAL;
 
-	if (!machine->current_tid) {
-		int i;
-
-		machine->current_tid = calloc(nr_cpus, sizeof(pid_t));
-		if (!machine->current_tid)
-			return -ENOMEM;
-		for (i = 0; i < nr_cpus; i++)
-			machine->current_tid[i] = -1;
-	}
-
-	if (cpu >= nr_cpus) {
-		pr_err("Requested CPU %d too large. ", cpu);
-		pr_err("Consider raising MAX_NR_CPUS\n");
-		return -EINVAL;
-	}
+	if (realloc_array_as_needed(machine->current_tid,
+				    machine->current_tid_sz,
+				    (unsigned int)cpu,
+				    &init_val))
+		return -ENOMEM;
 
 	machine->current_tid[cpu] = tid;
 
@@ -3327,3 +3324,18 @@ int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv
 	}
 	return err;
 }
+
+int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, void *priv)
+{
+	struct maps *maps = machine__kernel_maps(machine);
+	struct map *map;
+	int err = 0;
+
+	for (map = maps__first(maps); map != NULL; map = map__next(map)) {
+		err = fn(map, priv);
+		if (err != 0) {
+			break;
+		}
+	}
+	return err;
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 5d7daf7cb7bc..74935dfaa937 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -48,6 +48,7 @@ struct machine {
 	bool		  single_address_space;
 	char		  *root_dir;
 	char		  *mmap_name;
+	char		  *kallsyms_filename;
 	struct threads    threads[THREADS__TABLE_SIZE];
 	struct vdso_info  *vdso_info;
 	struct perf_env   *env;
@@ -56,6 +57,7 @@ struct machine {
 	struct map	  *vmlinux_map;
 	u64		  kernel_start;
 	pid_t		  *current_tid;
+	size_t		  current_tid_sz;
 	union { /* Tool specific area */
 		void	  *priv;
 		u64	  db_id;
@@ -262,6 +264,11 @@ typedef int (*machine__dso_t)(struct dso *dso, struct machine *machine, void *pr
 
 int machine__for_each_dso(struct machine *machine, machine__dso_t fn,
 			  void *priv);
+
+typedef int (*machine__map_t)(struct map *map, void *priv);
+int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn,
+				 void *priv);
+
 int machine__for_each_thread(struct machine *machine,
 			     int (*fn)(struct thread *thread, void *p),
 			     void *priv);
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index ee8fcfa115e5..8f7baeabc5cf 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -1372,6 +1372,7 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
 
 	*out_evlist = NULL;
 	if (!metric_no_merge || hashmap__size(ids->ids) == 0) {
+		bool added_event = false;
 		int i;
 		/*
 		 * We may fail to share events between metrics because a tool
@@ -1393,8 +1394,16 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
 				if (!tmp)
 					return -ENOMEM;
 				ids__insert(ids->ids, tmp);
+				added_event = true;
 			}
 		}
+		if (!added_event && hashmap__size(ids->ids) == 0) {
+			char *tmp = strdup("duration_time");
+
+			if (!tmp)
+				return -ENOMEM;
+			ids__insert(ids->ids, tmp);
+		}
 	}
 	ret = metricgroup__build_event_string(&events, ids, modifier,
 					      has_constraint);
diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
index 548008f74d42..2dd67c60f211 100644
--- a/tools/perf/util/off_cpu.h
+++ b/tools/perf/util/off_cpu.h
@@ -1,6 +1,8 @@
 #ifndef PERF_UTIL_OFF_CPU_H
 #define PERF_UTIL_OFF_CPU_H
 
+#include <linux/perf_event.h>
+
 struct evlist;
 struct target;
 struct perf_session;
@@ -8,6 +10,13 @@ struct record_opts;
 
 #define OFFCPU_EVENT  "offcpu-time"
 
+#define OFFCPU_SAMPLE_TYPES  (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP | \
+			      PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+			      PERF_SAMPLE_ID | PERF_SAMPLE_CPU | \
+			      PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN | \
+			      PERF_SAMPLE_CGROUP)
+
+
 #ifdef HAVE_BPF_SKEL
 int off_cpu_prepare(struct evlist *evlist, struct target *target,
 		    struct record_opts *opts);
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
index 0b05c3c0aeaa..8febbd7c98ca 100644
--- a/tools/perf/util/ordered-events.h
+++ b/tools/perf/util/ordered-events.h
@@ -75,4 +75,10 @@ void ordered_events__set_copy_on_queue(struct ordered_events *oe, bool copy)
 {
 	oe->copy_on_queue = copy;
 }
+
+static inline u64 ordered_events__last_flush_time(struct ordered_events *oe)
+{
+	return oe->last_flush;
+}
+
 #endif /* __ORDERED_EVENTS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 7ed235740431..206c76623c06 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -5,18 +5,12 @@
 #include <dirent.h>
 #include <errno.h>
 #include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
 #include <sys/param.h>
 #include "term.h"
-#include "build-id.h"
 #include "evlist.h"
 #include "evsel.h"
-#include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
 #include "parse-events.h"
-#include <subcmd/exec-cmd.h>
 #include "string2.h"
 #include "strlist.h"
 #include "bpf-loader.h"
@@ -24,23 +18,24 @@
 #include <api/fs/tracing_path.h>
 #include <perf/cpumap.h>
 #include "parse-events-bison.h"
-#define YY_EXTRA_TYPE void*
 #include "parse-events-flex.h"
 #include "pmu.h"
-#include "thread_map.h"
-#include "probe-file.h"
 #include "asm/bug.h"
 #include "util/parse-branch-options.h"
-#include "metricgroup.h"
 #include "util/evsel_config.h"
 #include "util/event.h"
-#include "util/pfm.h"
+#include "perf.h"
 #include "util/parse-events-hybrid.h"
 #include "util/pmu-hybrid.h"
-#include "perf.h"
+#include "tracepoint.h"
 
 #define MAX_NAME_LEN 100
 
+struct perf_pmu_event_symbol {
+	char	*symbol;
+	enum perf_pmu_event_symbol_type	type;
+};
+
 #ifdef PARSER_DEBUG
 extern int parse_events_debug;
 #endif
@@ -154,21 +149,6 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
 	},
 };
 
-struct event_symbol event_symbols_tool[PERF_TOOL_MAX] = {
-	[PERF_TOOL_DURATION_TIME] = {
-		.symbol = "duration_time",
-		.alias  = "",
-	},
-	[PERF_TOOL_USER_TIME] = {
-		.symbol = "user_time",
-		.alias  = "",
-	},
-	[PERF_TOOL_SYSTEM_TIME] = {
-		.symbol = "system_time",
-		.alias  = "",
-	},
-};
-
 #define __PERF_EVENT_FIELD(config, name) \
 	((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT)
 
@@ -177,121 +157,6 @@ struct event_symbol event_symbols_tool[PERF_TOOL_MAX] = {
 #define PERF_EVENT_TYPE(config)		__PERF_EVENT_FIELD(config, TYPE)
 #define PERF_EVENT_ID(config)		__PERF_EVENT_FIELD(config, EVENT)
 
-#define for_each_subsystem(sys_dir, sys_dirent)			\
-	while ((sys_dirent = readdir(sys_dir)) != NULL)		\
-		if (sys_dirent->d_type == DT_DIR &&		\
-		    (strcmp(sys_dirent->d_name, ".")) &&	\
-		    (strcmp(sys_dirent->d_name, "..")))
-
-static int tp_event_has_id(const char *dir_path, struct dirent *evt_dir)
-{
-	char evt_path[MAXPATHLEN];
-	int fd;
-
-	snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, evt_dir->d_name);
-	fd = open(evt_path, O_RDONLY);
-	if (fd < 0)
-		return -EINVAL;
-	close(fd);
-
-	return 0;
-}
-
-#define for_each_event(dir_path, evt_dir, evt_dirent)		\
-	while ((evt_dirent = readdir(evt_dir)) != NULL)		\
-		if (evt_dirent->d_type == DT_DIR &&		\
-		    (strcmp(evt_dirent->d_name, ".")) &&	\
-		    (strcmp(evt_dirent->d_name, "..")) &&	\
-		    (!tp_event_has_id(dir_path, evt_dirent)))
-
-#define MAX_EVENT_LENGTH 512
-
-struct tracepoint_path *tracepoint_id_to_path(u64 config)
-{
-	struct tracepoint_path *path = NULL;
-	DIR *sys_dir, *evt_dir;
-	struct dirent *sys_dirent, *evt_dirent;
-	char id_buf[24];
-	int fd;
-	u64 id;
-	char evt_path[MAXPATHLEN];
-	char *dir_path;
-
-	sys_dir = tracing_events__opendir();
-	if (!sys_dir)
-		return NULL;
-
-	for_each_subsystem(sys_dir, sys_dirent) {
-		dir_path = get_events_file(sys_dirent->d_name);
-		if (!dir_path)
-			continue;
-		evt_dir = opendir(dir_path);
-		if (!evt_dir)
-			goto next;
-
-		for_each_event(dir_path, evt_dir, evt_dirent) {
-
-			scnprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path,
-				  evt_dirent->d_name);
-			fd = open(evt_path, O_RDONLY);
-			if (fd < 0)
-				continue;
-			if (read(fd, id_buf, sizeof(id_buf)) < 0) {
-				close(fd);
-				continue;
-			}
-			close(fd);
-			id = atoll(id_buf);
-			if (id == config) {
-				put_events_file(dir_path);
-				closedir(evt_dir);
-				closedir(sys_dir);
-				path = zalloc(sizeof(*path));
-				if (!path)
-					return NULL;
-				if (asprintf(&path->system, "%.*s", MAX_EVENT_LENGTH, sys_dirent->d_name) < 0) {
-					free(path);
-					return NULL;
-				}
-				if (asprintf(&path->name, "%.*s", MAX_EVENT_LENGTH, evt_dirent->d_name) < 0) {
-					zfree(&path->system);
-					free(path);
-					return NULL;
-				}
-				return path;
-			}
-		}
-		closedir(evt_dir);
-next:
-		put_events_file(dir_path);
-	}
-
-	closedir(sys_dir);
-	return NULL;
-}
-
-struct tracepoint_path *tracepoint_name_to_path(const char *name)
-{
-	struct tracepoint_path *path = zalloc(sizeof(*path));
-	char *str = strchr(name, ':');
-
-	if (path == NULL || str == NULL) {
-		free(path);
-		return NULL;
-	}
-
-	path->system = strndup(name, str - name);
-	path->name = strdup(str+1);
-
-	if (path->system == NULL || path->name == NULL) {
-		zfree(&path->system);
-		zfree(&path->name);
-		zfree(&path);
-	}
-
-	return path;
-}
-
 const char *event_type(int type)
 {
 	switch (type) {
@@ -2666,571 +2531,6 @@ int exclude_perf(const struct option *opt,
 					  NULL);
 }
 
-static const char * const event_type_descriptors[] = {
-	"Hardware event",
-	"Software event",
-	"Tracepoint event",
-	"Hardware cache event",
-	"Raw hardware event descriptor",
-	"Hardware breakpoint",
-};
-
-static int cmp_string(const void *a, const void *b)
-{
-	const char * const *as = a;
-	const char * const *bs = b;
-
-	return strcmp(*as, *bs);
-}
-
-/*
- * Print the events from <debugfs_mount_point>/tracing/events
- */
-
-void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
-			     bool name_only)
-{
-	DIR *sys_dir, *evt_dir;
-	struct dirent *sys_dirent, *evt_dirent;
-	char evt_path[MAXPATHLEN];
-	char *dir_path;
-	char **evt_list = NULL;
-	unsigned int evt_i = 0, evt_num = 0;
-	bool evt_num_known = false;
-
-restart:
-	sys_dir = tracing_events__opendir();
-	if (!sys_dir)
-		return;
-
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_close_sys_dir;
-	}
-
-	for_each_subsystem(sys_dir, sys_dirent) {
-		if (subsys_glob != NULL &&
-		    !strglobmatch(sys_dirent->d_name, subsys_glob))
-			continue;
-
-		dir_path = get_events_file(sys_dirent->d_name);
-		if (!dir_path)
-			continue;
-		evt_dir = opendir(dir_path);
-		if (!evt_dir)
-			goto next;
-
-		for_each_event(dir_path, evt_dir, evt_dirent) {
-			if (event_glob != NULL &&
-			    !strglobmatch(evt_dirent->d_name, event_glob))
-				continue;
-
-			if (!evt_num_known) {
-				evt_num++;
-				continue;
-			}
-
-			snprintf(evt_path, MAXPATHLEN, "%s:%s",
-				 sys_dirent->d_name, evt_dirent->d_name);
-
-			evt_list[evt_i] = strdup(evt_path);
-			if (evt_list[evt_i] == NULL) {
-				put_events_file(dir_path);
-				goto out_close_evt_dir;
-			}
-			evt_i++;
-		}
-		closedir(evt_dir);
-next:
-		put_events_file(dir_path);
-	}
-	closedir(sys_dir);
-
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
-		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++],
-				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
-	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-	return;
-
-out_close_evt_dir:
-	closedir(evt_dir);
-out_close_sys_dir:
-	closedir(sys_dir);
-
-	printf("FATAL: not enough memory to print %s\n",
-			event_type_descriptors[PERF_TYPE_TRACEPOINT]);
-	if (evt_list)
-		goto out_free;
-}
-
-/*
- * Check whether event is in <debugfs_mount_point>/tracing/events
- */
-
-int is_valid_tracepoint(const char *event_string)
-{
-	DIR *sys_dir, *evt_dir;
-	struct dirent *sys_dirent, *evt_dirent;
-	char evt_path[MAXPATHLEN];
-	char *dir_path;
-
-	sys_dir = tracing_events__opendir();
-	if (!sys_dir)
-		return 0;
-
-	for_each_subsystem(sys_dir, sys_dirent) {
-		dir_path = get_events_file(sys_dirent->d_name);
-		if (!dir_path)
-			continue;
-		evt_dir = opendir(dir_path);
-		if (!evt_dir)
-			goto next;
-
-		for_each_event(dir_path, evt_dir, evt_dirent) {
-			snprintf(evt_path, MAXPATHLEN, "%s:%s",
-				 sys_dirent->d_name, evt_dirent->d_name);
-			if (!strcmp(evt_path, event_string)) {
-				closedir(evt_dir);
-				closedir(sys_dir);
-				return 1;
-			}
-		}
-		closedir(evt_dir);
-next:
-		put_events_file(dir_path);
-	}
-	closedir(sys_dir);
-	return 0;
-}
-
-static bool is_event_supported(u8 type, u64 config)
-{
-	bool ret = true;
-	int open_return;
-	struct evsel *evsel;
-	struct perf_event_attr attr = {
-		.type = type,
-		.config = config,
-		.disabled = 1,
-	};
-	struct perf_thread_map *tmap = thread_map__new_by_tid(0);
-
-	if (tmap == NULL)
-		return false;
-
-	evsel = evsel__new(&attr);
-	if (evsel) {
-		open_return = evsel__open(evsel, NULL, tmap);
-		ret = open_return >= 0;
-
-		if (open_return == -EACCES) {
-			/*
-			 * This happens if the paranoid value
-			 * /proc/sys/kernel/perf_event_paranoid is set to 2
-			 * Re-run with exclude_kernel set; we don't do that
-			 * by default as some ARM machines do not support it.
-			 *
-			 */
-			evsel->core.attr.exclude_kernel = 1;
-			ret = evsel__open(evsel, NULL, tmap) >= 0;
-		}
-		evsel__delete(evsel);
-	}
-
-	perf_thread_map__put(tmap);
-	return ret;
-}
-
-void print_sdt_events(const char *subsys_glob, const char *event_glob,
-		      bool name_only)
-{
-	struct probe_cache *pcache;
-	struct probe_cache_entry *ent;
-	struct strlist *bidlist, *sdtlist;
-	struct strlist_config cfg = {.dont_dupstr = true};
-	struct str_node *nd, *nd2;
-	char *buf, *path, *ptr = NULL;
-	bool show_detail = false;
-	int ret;
-
-	sdtlist = strlist__new(NULL, &cfg);
-	if (!sdtlist) {
-		pr_debug("Failed to allocate new strlist for SDT\n");
-		return;
-	}
-	bidlist = build_id_cache__list_all(true);
-	if (!bidlist) {
-		pr_debug("Failed to get buildids: %d\n", errno);
-		return;
-	}
-	strlist__for_each_entry(nd, bidlist) {
-		pcache = probe_cache__new(nd->s, NULL);
-		if (!pcache)
-			continue;
-		list_for_each_entry(ent, &pcache->entries, node) {
-			if (!ent->sdt)
-				continue;
-			if (subsys_glob &&
-			    !strglobmatch(ent->pev.group, subsys_glob))
-				continue;
-			if (event_glob &&
-			    !strglobmatch(ent->pev.event, event_glob))
-				continue;
-			ret = asprintf(&buf, "%s:%s@%s", ent->pev.group,
-					ent->pev.event, nd->s);
-			if (ret > 0)
-				strlist__add(sdtlist, buf);
-		}
-		probe_cache__delete(pcache);
-	}
-	strlist__delete(bidlist);
-
-	strlist__for_each_entry(nd, sdtlist) {
-		buf = strchr(nd->s, '@');
-		if (buf)
-			*(buf++) = '\0';
-		if (name_only) {
-			printf("%s ", nd->s);
-			continue;
-		}
-		nd2 = strlist__next(nd);
-		if (nd2) {
-			ptr = strchr(nd2->s, '@');
-			if (ptr)
-				*ptr = '\0';
-			if (strcmp(nd->s, nd2->s) == 0)
-				show_detail = true;
-		}
-		if (show_detail) {
-			path = build_id_cache__origname(buf);
-			ret = asprintf(&buf, "%s@%s(%.12s)", nd->s, path, buf);
-			if (ret > 0) {
-				printf("  %-50s [%s]\n", buf, "SDT event");
-				free(buf);
-			}
-			free(path);
-		} else
-			printf("  %-50s [%s]\n", nd->s, "SDT event");
-		if (nd2) {
-			if (strcmp(nd->s, nd2->s) != 0)
-				show_detail = false;
-			if (ptr)
-				*ptr = '@';
-		}
-	}
-	strlist__delete(sdtlist);
-}
-
-int print_hwcache_events(const char *event_glob, bool name_only)
-{
-	unsigned int type, op, i, evt_i = 0, evt_num = 0, npmus = 0;
-	char name[64], new_name[128];
-	char **evt_list = NULL, **evt_pmus = NULL;
-	bool evt_num_known = false;
-	struct perf_pmu *pmu = NULL;
-
-	if (perf_pmu__has_hybrid()) {
-		npmus = perf_pmu__hybrid_pmu_num();
-		evt_pmus = zalloc(sizeof(char *) * npmus);
-		if (!evt_pmus)
-			goto out_enomem;
-	}
-
-restart:
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_enomem;
-	}
-
-	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
-		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
-			/* skip invalid cache type */
-			if (!evsel__is_cache_op_valid(type, op))
-				continue;
-
-			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
-				unsigned int hybrid_supported = 0, j;
-				bool supported;
-
-				__evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name));
-				if (event_glob != NULL && !strglobmatch(name, event_glob))
-					continue;
-
-				if (!perf_pmu__has_hybrid()) {
-					if (!is_event_supported(PERF_TYPE_HW_CACHE,
-								type | (op << 8) | (i << 16))) {
-						continue;
-					}
-				} else {
-					perf_pmu__for_each_hybrid_pmu(pmu) {
-						if (!evt_num_known) {
-							evt_num++;
-							continue;
-						}
-
-						supported = is_event_supported(
-									PERF_TYPE_HW_CACHE,
-									type | (op << 8) | (i << 16) |
-									((__u64)pmu->type << PERF_PMU_TYPE_SHIFT));
-						if (supported) {
-							snprintf(new_name, sizeof(new_name), "%s/%s/",
-								 pmu->name, name);
-							evt_pmus[hybrid_supported] = strdup(new_name);
-							hybrid_supported++;
-						}
-					}
-
-					if (hybrid_supported == 0)
-						continue;
-				}
-
-				if (!evt_num_known) {
-					evt_num++;
-					continue;
-				}
-
-				if ((hybrid_supported == 0) ||
-				    (hybrid_supported == npmus)) {
-					evt_list[evt_i] = strdup(name);
-					if (npmus > 0) {
-						for (j = 0; j < npmus; j++)
-							zfree(&evt_pmus[j]);
-					}
-				} else {
-					for (j = 0; j < hybrid_supported; j++) {
-						evt_list[evt_i++] = evt_pmus[j];
-						evt_pmus[j] = NULL;
-					}
-					continue;
-				}
-
-				if (evt_list[evt_i] == NULL)
-					goto out_enomem;
-				evt_i++;
-			}
-		}
-	}
-
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-
-	for (evt_i = 0; evt_i < evt_num; evt_i++) {
-		if (!evt_list[evt_i])
-			break;
-	}
-
-	evt_num = evt_i;
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
-		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++],
-				event_type_descriptors[PERF_TYPE_HW_CACHE]);
-	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-
-	for (evt_i = 0; evt_i < npmus; evt_i++)
-		zfree(&evt_pmus[evt_i]);
-	zfree(&evt_pmus);
-	return evt_num;
-
-out_enomem:
-	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[PERF_TYPE_HW_CACHE]);
-	if (evt_list)
-		goto out_free;
-	return evt_num;
-}
-
-static void print_tool_event(const struct event_symbol *syms, const char *event_glob,
-			     bool name_only)
-{
-	if (syms->symbol == NULL)
-		return;
-
-	if (event_glob && !(strglobmatch(syms->symbol, event_glob) ||
-	      (syms->alias && strglobmatch(syms->alias, event_glob))))
-		return;
-
-	if (name_only)
-		printf("%s ", syms->symbol);
-	else {
-		char name[MAX_NAME_LEN];
-		if (syms->alias && strlen(syms->alias))
-			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
-		else
-			strlcpy(name, syms->symbol, MAX_NAME_LEN);
-		printf("  %-50s [%s]\n", name, "Tool event");
-	}
-}
-
-void print_tool_events(const char *event_glob, bool name_only)
-{
-	// Start at 1 because the first enum entry symbols no tool event
-	for (int i = 1; i < PERF_TOOL_MAX; ++i) {
-		print_tool_event(event_symbols_tool + i, event_glob, name_only);
-	}
-	if (pager_in_use())
-		printf("\n");
-}
-
-void print_symbol_events(const char *event_glob, unsigned type,
-				struct event_symbol *syms, unsigned max,
-				bool name_only)
-{
-	unsigned int i, evt_i = 0, evt_num = 0;
-	char name[MAX_NAME_LEN];
-	char **evt_list = NULL;
-	bool evt_num_known = false;
-
-restart:
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_enomem;
-		syms -= max;
-	}
-
-	for (i = 0; i < max; i++, syms++) {
-		/*
-		 * New attr.config still not supported here, the latest
-		 * example was PERF_COUNT_SW_CGROUP_SWITCHES
-		 */
-		if (syms->symbol == NULL)
-			continue;
-
-		if (event_glob != NULL && !(strglobmatch(syms->symbol, event_glob) ||
-		      (syms->alias && strglobmatch(syms->alias, event_glob))))
-			continue;
-
-		if (!is_event_supported(type, i))
-			continue;
-
-		if (!evt_num_known) {
-			evt_num++;
-			continue;
-		}
-
-		if (!name_only && strlen(syms->alias))
-			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
-		else
-			strlcpy(name, syms->symbol, MAX_NAME_LEN);
-
-		evt_list[evt_i] = strdup(name);
-		if (evt_list[evt_i] == NULL)
-			goto out_enomem;
-		evt_i++;
-	}
-
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
-		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++], event_type_descriptors[type]);
-	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-	return;
-
-out_enomem:
-	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[type]);
-	if (evt_list)
-		goto out_free;
-}
-
-/*
- * Print the help text for the event symbols:
- */
-void print_events(const char *event_glob, bool name_only, bool quiet_flag,
-			bool long_desc, bool details_flag, bool deprecated,
-			const char *pmu_name)
-{
-	print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
-			    event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
-
-	print_symbol_events(event_glob, PERF_TYPE_SOFTWARE,
-			    event_symbols_sw, PERF_COUNT_SW_MAX, name_only);
-	print_tool_events(event_glob, name_only);
-
-	print_hwcache_events(event_glob, name_only);
-
-	print_pmu_events(event_glob, name_only, quiet_flag, long_desc,
-			details_flag, deprecated, pmu_name);
-
-	if (event_glob != NULL)
-		return;
-
-	if (!name_only) {
-		printf("  %-50s [%s]\n",
-		       "rNNN",
-		       event_type_descriptors[PERF_TYPE_RAW]);
-		printf("  %-50s [%s]\n",
-		       "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
-		       event_type_descriptors[PERF_TYPE_RAW]);
-		if (pager_in_use())
-			printf("   (see 'man perf-list' on how to encode it)\n\n");
-
-		printf("  %-50s [%s]\n",
-		       "mem:<addr>[/len][:access]",
-			event_type_descriptors[PERF_TYPE_BREAKPOINT]);
-		if (pager_in_use())
-			printf("\n");
-	}
-
-	print_tracepoint_events(NULL, NULL, name_only);
-
-	print_sdt_events(NULL, NULL, name_only);
-
-	metricgroup__print(true, true, NULL, name_only, details_flag,
-			   pmu_name);
-
-	print_libpfm_events(name_only, long_desc);
-}
-
 int parse_events__is_hardcoded_term(struct parse_events_term *term)
 {
 	return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index a38b8b160e80..ba9fa3ddaf6e 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -11,7 +11,6 @@
 #include <linux/perf_event.h>
 #include <string.h>
 
-struct list_head;
 struct evsel;
 struct evlist;
 struct parse_events_error;
@@ -19,14 +18,6 @@ struct parse_events_error;
 struct option;
 struct perf_pmu;
 
-struct tracepoint_path {
-	char *system;
-	char *name;
-	struct tracepoint_path *next;
-};
-
-struct tracepoint_path *tracepoint_id_to_path(u64 config);
-struct tracepoint_path *tracepoint_name_to_path(const char *name);
 bool have_tracepoints(struct list_head *evlist);
 
 const char *event_type(int type);
@@ -46,8 +37,6 @@ int parse_events_terms(struct list_head *terms, const char *str);
 int parse_filter(const struct option *opt, const char *str, int unset);
 int exclude_perf(const struct option *opt, const char *arg, int unset);
 
-#define EVENTS_HELP_MAX (128*1024)
-
 enum perf_pmu_event_symbol_type {
 	PMU_EVENT_SYMBOL_ERR,		/* not a PMU EVENT */
 	PMU_EVENT_SYMBOL,		/* normal style PMU event */
@@ -56,11 +45,6 @@ enum perf_pmu_event_symbol_type {
 	PMU_EVENT_SYMBOL_SUFFIX2,	/* suffix of pre-suf2 style event */
 };
 
-struct perf_pmu_event_symbol {
-	char	*symbol;
-	enum perf_pmu_event_symbol_type	type;
-};
-
 enum {
 	PARSE_EVENTS__TERM_TYPE_NUM,
 	PARSE_EVENTS__TERM_TYPE_STR,
@@ -219,28 +203,13 @@ void parse_events_update_lists(struct list_head *list_event,
 void parse_events_evlist_error(struct parse_events_state *parse_state,
 			       int idx, const char *str);
 
-void print_events(const char *event_glob, bool name_only, bool quiet,
-		  bool long_desc, bool details_flag, bool deprecated,
-		  const char *pmu_name);
-
 struct event_symbol {
 	const char	*symbol;
 	const char	*alias;
 };
 extern struct event_symbol event_symbols_hw[];
 extern struct event_symbol event_symbols_sw[];
-void print_symbol_events(const char *event_glob, unsigned type,
-				struct event_symbol *syms, unsigned max,
-				bool name_only);
-void print_tool_events(const char *event_glob, bool name_only);
-void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
-			     bool name_only);
-int print_hwcache_events(const char *event_glob, bool name_only);
-void print_sdt_events(const char *subsys_glob, const char *event_glob,
-		      bool name_only);
-int is_valid_tracepoint(const char *event_string);
 
-int valid_event_mount(const char *eventfs);
 char *parse_events_formats_error_string(char *additional_terms);
 
 void parse_events_error__init(struct parse_events_error *err);
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 9a1c7e63e663..0112e1c36418 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1890,7 +1890,11 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu)
 	const char *sysfs = sysfs__mountpoint();
 	DIR *caps_dir;
 	struct dirent *evt_ent;
-	int nr_caps = 0;
+
+	if (pmu->caps_initialized)
+		return pmu->nr_caps;
+
+	pmu->nr_caps = 0;
 
 	if (!sysfs)
 		return -1;
@@ -1898,8 +1902,10 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu)
 	snprintf(caps_path, PATH_MAX,
 		 "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name);
 
-	if (stat(caps_path, &st) < 0)
+	if (stat(caps_path, &st) < 0) {
+		pmu->caps_initialized = true;
 		return 0;	/* no error if caps does not exist */
+	}
 
 	caps_dir = opendir(caps_path);
 	if (!caps_dir)
@@ -1926,13 +1932,14 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu)
 			continue;
 		}
 
-		nr_caps++;
+		pmu->nr_caps++;
 		fclose(file);
 	}
 
 	closedir(caps_dir);
 
-	return nr_caps;
+	pmu->caps_initialized = true;
+	return pmu->nr_caps;
 }
 
 void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 541889fa9f9c..4b45fd8da5a3 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -46,6 +46,8 @@ struct perf_pmu {
 	struct perf_cpu_map *cpus;
 	struct list_head format;  /* HEAD struct perf_pmu_format -> list */
 	struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
+	bool caps_initialized;
+	u32 nr_caps;
 	struct list_head caps;    /* HEAD struct perf_pmu_caps -> list */
 	struct list_head list;    /* ELEM */
 	struct list_head hybrid_list;
diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
new file mode 100644
index 000000000000..ba1ab5134685
--- /dev/null
+++ b/tools/perf/util/print-events.c
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <dirent.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+
+#include <api/fs/tracing_path.h>
+#include <linux/stddef.h>
+#include <linux/perf_event.h>
+#include <linux/zalloc.h>
+#include <subcmd/pager.h>
+
+#include "build-id.h"
+#include "debug.h"
+#include "evsel.h"
+#include "metricgroup.h"
+#include "parse-events.h"
+#include "pmu.h"
+#include "print-events.h"
+#include "probe-file.h"
+#include "string2.h"
+#include "strlist.h"
+#include "thread_map.h"
+#include "tracepoint.h"
+#include "pfm.h"
+#include "pmu-hybrid.h"
+
+#define MAX_NAME_LEN 100
+
+static const char * const event_type_descriptors[] = {
+	"Hardware event",
+	"Software event",
+	"Tracepoint event",
+	"Hardware cache event",
+	"Raw hardware event descriptor",
+	"Hardware breakpoint",
+};
+
+static const struct event_symbol event_symbols_tool[PERF_TOOL_MAX] = {
+	[PERF_TOOL_DURATION_TIME] = {
+		.symbol = "duration_time",
+		.alias  = "",
+	},
+	[PERF_TOOL_USER_TIME] = {
+		.symbol = "user_time",
+		.alias  = "",
+	},
+	[PERF_TOOL_SYSTEM_TIME] = {
+		.symbol = "system_time",
+		.alias  = "",
+	},
+};
+
+static int cmp_string(const void *a, const void *b)
+{
+	const char * const *as = a;
+	const char * const *bs = b;
+
+	return strcmp(*as, *bs);
+}
+
+/*
+ * Print the events from <debugfs_mount_point>/tracing/events
+ */
+void print_tracepoint_events(const char *subsys_glob,
+			     const char *event_glob, bool name_only)
+{
+	DIR *sys_dir, *evt_dir;
+	struct dirent *sys_dirent, *evt_dirent;
+	char evt_path[MAXPATHLEN];
+	char *dir_path;
+	char **evt_list = NULL;
+	unsigned int evt_i = 0, evt_num = 0;
+	bool evt_num_known = false;
+
+restart:
+	sys_dir = tracing_events__opendir();
+	if (!sys_dir)
+		return;
+
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_close_sys_dir;
+	}
+
+	for_each_subsystem(sys_dir, sys_dirent) {
+		if (subsys_glob != NULL &&
+		    !strglobmatch(sys_dirent->d_name, subsys_glob))
+			continue;
+
+		dir_path = get_events_file(sys_dirent->d_name);
+		if (!dir_path)
+			continue;
+		evt_dir = opendir(dir_path);
+		if (!evt_dir)
+			goto next;
+
+		for_each_event(dir_path, evt_dir, evt_dirent) {
+			if (event_glob != NULL &&
+			    !strglobmatch(evt_dirent->d_name, event_glob))
+				continue;
+
+			if (!evt_num_known) {
+				evt_num++;
+				continue;
+			}
+
+			snprintf(evt_path, MAXPATHLEN, "%s:%s",
+				 sys_dirent->d_name, evt_dirent->d_name);
+
+			evt_list[evt_i] = strdup(evt_path);
+			if (evt_list[evt_i] == NULL) {
+				put_events_file(dir_path);
+				goto out_close_evt_dir;
+			}
+			evt_i++;
+		}
+		closedir(evt_dir);
+next:
+		put_events_file(dir_path);
+	}
+	closedir(sys_dir);
+
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++],
+				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+	}
+	if (evt_num && pager_in_use())
+		printf("\n");
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+	return;
+
+out_close_evt_dir:
+	closedir(evt_dir);
+out_close_sys_dir:
+	closedir(sys_dir);
+
+	printf("FATAL: not enough memory to print %s\n",
+			event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+	if (evt_list)
+		goto out_free;
+}
+
+void print_sdt_events(const char *subsys_glob, const char *event_glob,
+		      bool name_only)
+{
+	struct probe_cache *pcache;
+	struct probe_cache_entry *ent;
+	struct strlist *bidlist, *sdtlist;
+	struct strlist_config cfg = {.dont_dupstr = true};
+	struct str_node *nd, *nd2;
+	char *buf, *path, *ptr = NULL;
+	bool show_detail = false;
+	int ret;
+
+	sdtlist = strlist__new(NULL, &cfg);
+	if (!sdtlist) {
+		pr_debug("Failed to allocate new strlist for SDT\n");
+		return;
+	}
+	bidlist = build_id_cache__list_all(true);
+	if (!bidlist) {
+		pr_debug("Failed to get buildids: %d\n", errno);
+		return;
+	}
+	strlist__for_each_entry(nd, bidlist) {
+		pcache = probe_cache__new(nd->s, NULL);
+		if (!pcache)
+			continue;
+		list_for_each_entry(ent, &pcache->entries, node) {
+			if (!ent->sdt)
+				continue;
+			if (subsys_glob &&
+			    !strglobmatch(ent->pev.group, subsys_glob))
+				continue;
+			if (event_glob &&
+			    !strglobmatch(ent->pev.event, event_glob))
+				continue;
+			ret = asprintf(&buf, "%s:%s@%s", ent->pev.group,
+					ent->pev.event, nd->s);
+			if (ret > 0)
+				strlist__add(sdtlist, buf);
+		}
+		probe_cache__delete(pcache);
+	}
+	strlist__delete(bidlist);
+
+	strlist__for_each_entry(nd, sdtlist) {
+		buf = strchr(nd->s, '@');
+		if (buf)
+			*(buf++) = '\0';
+		if (name_only) {
+			printf("%s ", nd->s);
+			continue;
+		}
+		nd2 = strlist__next(nd);
+		if (nd2) {
+			ptr = strchr(nd2->s, '@');
+			if (ptr)
+				*ptr = '\0';
+			if (strcmp(nd->s, nd2->s) == 0)
+				show_detail = true;
+		}
+		if (show_detail) {
+			path = build_id_cache__origname(buf);
+			ret = asprintf(&buf, "%s@%s(%.12s)", nd->s, path, buf);
+			if (ret > 0) {
+				printf("  %-50s [%s]\n", buf, "SDT event");
+				free(buf);
+			}
+			free(path);
+		} else
+			printf("  %-50s [%s]\n", nd->s, "SDT event");
+		if (nd2) {
+			if (strcmp(nd->s, nd2->s) != 0)
+				show_detail = false;
+			if (ptr)
+				*ptr = '@';
+		}
+	}
+	strlist__delete(sdtlist);
+}
+
+static bool is_event_supported(u8 type, unsigned int config)
+{
+	bool ret = true;
+	int open_return;
+	struct evsel *evsel;
+	struct perf_event_attr attr = {
+		.type = type,
+		.config = config,
+		.disabled = 1,
+	};
+	struct perf_thread_map *tmap = thread_map__new_by_tid(0);
+
+	if (tmap == NULL)
+		return false;
+
+	evsel = evsel__new(&attr);
+	if (evsel) {
+		open_return = evsel__open(evsel, NULL, tmap);
+		ret = open_return >= 0;
+
+		if (open_return == -EACCES) {
+			/*
+			 * This happens if the paranoid value
+			 * /proc/sys/kernel/perf_event_paranoid is set to 2
+			 * Re-run with exclude_kernel set; we don't do that
+			 * by default as some ARM machines do not support it.
+			 *
+			 */
+			evsel->core.attr.exclude_kernel = 1;
+			ret = evsel__open(evsel, NULL, tmap) >= 0;
+		}
+		evsel__delete(evsel);
+	}
+
+	perf_thread_map__put(tmap);
+	return ret;
+}
+
+int print_hwcache_events(const char *event_glob, bool name_only)
+{
+	unsigned int type, op, i, evt_i = 0, evt_num = 0, npmus = 0;
+	char name[64], new_name[128];
+	char **evt_list = NULL, **evt_pmus = NULL;
+	bool evt_num_known = false;
+	struct perf_pmu *pmu = NULL;
+
+	if (perf_pmu__has_hybrid()) {
+		npmus = perf_pmu__hybrid_pmu_num();
+		evt_pmus = zalloc(sizeof(char *) * npmus);
+		if (!evt_pmus)
+			goto out_enomem;
+	}
+
+restart:
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_enomem;
+	}
+
+	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
+		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
+			/* skip invalid cache type */
+			if (!evsel__is_cache_op_valid(type, op))
+				continue;
+
+			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
+				unsigned int hybrid_supported = 0, j;
+				bool supported;
+
+				__evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name));
+				if (event_glob != NULL && !strglobmatch(name, event_glob))
+					continue;
+
+				if (!perf_pmu__has_hybrid()) {
+					if (!is_event_supported(PERF_TYPE_HW_CACHE,
+								type | (op << 8) | (i << 16))) {
+						continue;
+					}
+				} else {
+					perf_pmu__for_each_hybrid_pmu(pmu) {
+						if (!evt_num_known) {
+							evt_num++;
+							continue;
+						}
+
+						supported = is_event_supported(
+							PERF_TYPE_HW_CACHE,
+							type | (op << 8) | (i << 16) |
+							((__u64)pmu->type << PERF_PMU_TYPE_SHIFT));
+						if (supported) {
+							snprintf(new_name, sizeof(new_name),
+								 "%s/%s/", pmu->name, name);
+							evt_pmus[hybrid_supported] =
+								strdup(new_name);
+							hybrid_supported++;
+						}
+					}
+
+					if (hybrid_supported == 0)
+						continue;
+				}
+
+				if (!evt_num_known) {
+					evt_num++;
+					continue;
+				}
+
+				if ((hybrid_supported == 0) ||
+				    (hybrid_supported == npmus)) {
+					evt_list[evt_i] = strdup(name);
+					if (npmus > 0) {
+						for (j = 0; j < npmus; j++)
+							zfree(&evt_pmus[j]);
+					}
+				} else {
+					for (j = 0; j < hybrid_supported; j++) {
+						evt_list[evt_i++] = evt_pmus[j];
+						evt_pmus[j] = NULL;
+					}
+					continue;
+				}
+
+				if (evt_list[evt_i] == NULL)
+					goto out_enomem;
+				evt_i++;
+			}
+		}
+	}
+
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+
+	for (evt_i = 0; evt_i < evt_num; evt_i++) {
+		if (!evt_list[evt_i])
+			break;
+	}
+
+	evt_num = evt_i;
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++],
+				event_type_descriptors[PERF_TYPE_HW_CACHE]);
+	}
+	if (evt_num && pager_in_use())
+		printf("\n");
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+
+	for (evt_i = 0; evt_i < npmus; evt_i++)
+		zfree(&evt_pmus[evt_i]);
+	zfree(&evt_pmus);
+	return evt_num;
+
+out_enomem:
+	printf("FATAL: not enough memory to print %s\n",
+		event_type_descriptors[PERF_TYPE_HW_CACHE]);
+	if (evt_list)
+		goto out_free;
+	return evt_num;
+}
+
+static void print_tool_event(const struct event_symbol *syms, const char *event_glob,
+			     bool name_only)
+{
+	if (syms->symbol == NULL)
+		return;
+
+	if (event_glob && !(strglobmatch(syms->symbol, event_glob) ||
+	      (syms->alias && strglobmatch(syms->alias, event_glob))))
+		return;
+
+	if (name_only)
+		printf("%s ", syms->symbol);
+	else {
+		char name[MAX_NAME_LEN];
+
+		if (syms->alias && strlen(syms->alias))
+			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
+		else
+			strlcpy(name, syms->symbol, MAX_NAME_LEN);
+		printf("  %-50s [%s]\n", name, "Tool event");
+	}
+}
+
+void print_tool_events(const char *event_glob, bool name_only)
+{
+	// Start at 1 because the first enum entry means no tool event.
+	for (int i = 1; i < PERF_TOOL_MAX; ++i)
+		print_tool_event(event_symbols_tool + i, event_glob, name_only);
+
+	if (pager_in_use())
+		printf("\n");
+}
+
+void print_symbol_events(const char *event_glob, unsigned int type,
+			 struct event_symbol *syms, unsigned int max,
+			 bool name_only)
+{
+	unsigned int i, evt_i = 0, evt_num = 0;
+	char name[MAX_NAME_LEN];
+	char **evt_list = NULL;
+	bool evt_num_known = false;
+
+restart:
+	if (evt_num_known) {
+		evt_list = zalloc(sizeof(char *) * evt_num);
+		if (!evt_list)
+			goto out_enomem;
+		syms -= max;
+	}
+
+	for (i = 0; i < max; i++, syms++) {
+		/*
+		 * New attr.config still not supported here, the latest
+		 * example was PERF_COUNT_SW_CGROUP_SWITCHES
+		 */
+		if (syms->symbol == NULL)
+			continue;
+
+		if (event_glob != NULL && !(strglobmatch(syms->symbol, event_glob) ||
+		      (syms->alias && strglobmatch(syms->alias, event_glob))))
+			continue;
+
+		if (!is_event_supported(type, i))
+			continue;
+
+		if (!evt_num_known) {
+			evt_num++;
+			continue;
+		}
+
+		if (!name_only && strlen(syms->alias))
+			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
+		else
+			strlcpy(name, syms->symbol, MAX_NAME_LEN);
+
+		evt_list[evt_i] = strdup(name);
+		if (evt_list[evt_i] == NULL)
+			goto out_enomem;
+		evt_i++;
+	}
+
+	if (!evt_num_known) {
+		evt_num_known = true;
+		goto restart;
+	}
+	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
+	evt_i = 0;
+	while (evt_i < evt_num) {
+		if (name_only) {
+			printf("%s ", evt_list[evt_i++]);
+			continue;
+		}
+		printf("  %-50s [%s]\n", evt_list[evt_i++], event_type_descriptors[type]);
+	}
+	if (evt_num && pager_in_use())
+		printf("\n");
+
+out_free:
+	evt_num = evt_i;
+	for (evt_i = 0; evt_i < evt_num; evt_i++)
+		zfree(&evt_list[evt_i]);
+	zfree(&evt_list);
+	return;
+
+out_enomem:
+	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[type]);
+	if (evt_list)
+		goto out_free;
+}
+
+/*
+ * Print the help text for the event symbols:
+ */
+void print_events(const char *event_glob, bool name_only, bool quiet_flag,
+			bool long_desc, bool details_flag, bool deprecated,
+			const char *pmu_name)
+{
+	print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
+			    event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
+
+	print_symbol_events(event_glob, PERF_TYPE_SOFTWARE,
+			    event_symbols_sw, PERF_COUNT_SW_MAX, name_only);
+	print_tool_events(event_glob, name_only);
+
+	print_hwcache_events(event_glob, name_only);
+
+	print_pmu_events(event_glob, name_only, quiet_flag, long_desc,
+			details_flag, deprecated, pmu_name);
+
+	if (event_glob != NULL)
+		return;
+
+	if (!name_only) {
+		printf("  %-50s [%s]\n",
+		       "rNNN",
+		       event_type_descriptors[PERF_TYPE_RAW]);
+		printf("  %-50s [%s]\n",
+		       "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
+		       event_type_descriptors[PERF_TYPE_RAW]);
+		if (pager_in_use())
+			printf("   (see 'man perf-list' on how to encode it)\n\n");
+
+		printf("  %-50s [%s]\n",
+		       "mem:<addr>[/len][:access]",
+			event_type_descriptors[PERF_TYPE_BREAKPOINT]);
+		if (pager_in_use())
+			printf("\n");
+	}
+
+	print_tracepoint_events(NULL, NULL, name_only);
+
+	print_sdt_events(NULL, NULL, name_only);
+
+	metricgroup__print(true, true, NULL, name_only, details_flag,
+			   pmu_name);
+
+	print_libpfm_events(name_only, long_desc);
+}
diff --git a/tools/perf/util/print-events.h b/tools/perf/util/print-events.h
new file mode 100644
index 000000000000..1da9910d83a6
--- /dev/null
+++ b/tools/perf/util/print-events.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_PRINT_EVENTS_H
+#define __PERF_PRINT_EVENTS_H
+
+#include <stdbool.h>
+
+struct event_symbol;
+
+void print_events(const char *event_glob, bool name_only, bool quiet_flag,
+		  bool long_desc, bool details_flag, bool deprecated,
+		  const char *pmu_name);
+int print_hwcache_events(const char *event_glob, bool name_only);
+void print_sdt_events(const char *subsys_glob, const char *event_glob,
+		      bool name_only);
+void print_symbol_events(const char *event_glob, unsigned int type,
+			 struct event_symbol *syms, unsigned int max,
+			 bool name_only);
+void print_tool_events(const char *event_glob, bool name_only);
+void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
+			     bool name_only);
+
+#endif /* __PERF_PRINT_EVENTS_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 062b5cbe67af..67c12d5303e7 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1349,7 +1349,7 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 				/*
 				 * Adjust the number of lines here.
 				 * If the number of lines == 1, the
-				 * the end of line should be equal to
+				 * end of line should be equal to
 				 * the start of line.
 				 */
 				lr->end--;
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 5b09ecbb05dc..b529636ab3ea 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -121,7 +121,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call
 	evlist__for_each_entry(evlist, evsel)
 		evsel__config_leader_sampling(evsel, evlist);
 
-	if (opts->full_auxtrace) {
+	if (opts->full_auxtrace || opts->sample_identifier) {
 		/*
 		 * Need to be able to synthesize and parse selected events with
 		 * arbitrary sample types, which requires always being able to
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index be9a957501f4..4269e916f450 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -28,6 +28,7 @@ struct record_opts {
 	bool	      sample_time;
 	bool	      sample_time_set;
 	bool	      sample_cpu;
+	bool	      sample_identifier;
 	bool	      period;
 	bool	      period_set;
 	bool	      running_time;
diff --git a/tools/perf/util/scripting-engines/Build b/tools/perf/util/scripting-engines/Build
index 7b342ce38d99..c92326c2233a 100644
--- a/tools/perf/util/scripting-engines/Build
+++ b/tools/perf/util/scripting-engines/Build
@@ -1,6 +1,6 @@
 perf-$(CONFIG_LIBPERL)   += trace-event-perl.o
 perf-$(CONFIG_LIBPYTHON) += trace-event-python.o
 
-CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default
+CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum
 
-CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow
+CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-error=deprecated-declarations
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index adba01b7d9dd..5bbc1b16f368 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -861,6 +861,13 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 	brstacksym = python_process_brstacksym(sample, al->thread);
 	pydict_set_item_string_decref(dict, "brstacksym", brstacksym);
 
+	if (sample->machine_pid) {
+		pydict_set_item_string_decref(dict_sample, "machine_pid",
+				_PyLong_FromLong(sample->machine_pid));
+		pydict_set_item_string_decref(dict_sample, "vcpu",
+				_PyLong_FromLong(sample->vcpu));
+	}
+
 	pydict_set_item_string_decref(dict_sample, "cpumode",
 			_PyLong_FromLong((unsigned long)sample->cpumode));
 
@@ -1509,7 +1516,7 @@ static void python_do_process_switch(union perf_event *event,
 		np_tid = event->context_switch.next_prev_tid;
 	}
 
-	t = tuple_new(9);
+	t = tuple_new(11);
 	if (!t)
 		return;
 
@@ -1522,6 +1529,8 @@ static void python_do_process_switch(union perf_event *event,
 	tuple_set_s32(t, 6, machine->pid);
 	tuple_set_bool(t, 7, out);
 	tuple_set_bool(t, 8, out_preempt);
+	tuple_set_s32(t, 9, sample->machine_pid);
+	tuple_set_s32(t, 10, sample->vcpu);
 
 	call_object(handler, t, handler_name);
 
@@ -1559,7 +1568,7 @@ static void python_process_auxtrace_error(struct perf_session *session __maybe_u
 		msg = (const char *)&e->time;
 	}
 
-	t = tuple_new(9);
+	t = tuple_new(11);
 
 	tuple_set_u32(t, 0, e->type);
 	tuple_set_u32(t, 1, e->code);
@@ -1570,6 +1579,8 @@ static void python_process_auxtrace_error(struct perf_session *session __maybe_u
 	tuple_set_u64(t, 6, tm);
 	tuple_set_string(t, 7, msg);
 	tuple_set_u32(t, 8, cpumode);
+	tuple_set_s32(t, 9, e->machine_pid);
+	tuple_set_s32(t, 10, e->vcpu);
 
 	call_object(handler, t, handler_name);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0aa818977d2b..98e16659a149 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -374,10 +374,6 @@ static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-static int process_finished_round(struct perf_tool *tool,
-				  union perf_event *event,
-				  struct ordered_events *oe);
-
 static int skipn(int fd, off_t n)
 {
 	char buf[4096];
@@ -534,7 +530,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->build_id = process_event_op2_stub;
 	if (tool->finished_round == NULL) {
 		if (tool->ordered_events)
-			tool->finished_round = process_finished_round;
+			tool->finished_round = perf_event__process_finished_round;
 		else
 			tool->finished_round = process_finished_round_stub;
 	}
@@ -562,6 +558,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->feature = process_event_op2_stub;
 	if (tool->compressed == NULL)
 		tool->compressed = perf_session__process_compressed_event;
+	if (tool->finished_init == NULL)
+		tool->finished_init = process_event_op2_stub;
 }
 
 static void swap_sample_id_all(union perf_event *event, void *data)
@@ -897,6 +895,10 @@ static void perf_event__auxtrace_error_swap(union perf_event *event,
 	event->auxtrace_error.ip   = bswap_64(event->auxtrace_error.ip);
 	if (event->auxtrace_error.fmt)
 		event->auxtrace_error.time = bswap_64(event->auxtrace_error.time);
+	if (event->auxtrace_error.fmt >= 2) {
+		event->auxtrace_error.machine_pid = bswap_32(event->auxtrace_error.machine_pid);
+		event->auxtrace_error.vcpu = bswap_32(event->auxtrace_error.vcpu);
+	}
 }
 
 static void perf_event__thread_map_swap(union perf_event *event,
@@ -1067,9 +1069,9 @@ static perf_event__swap_op perf_event__swap_ops[] = {
  *      Flush every events below timestamp 7
  *      etc...
  */
-static int process_finished_round(struct perf_tool *tool __maybe_unused,
-				  union perf_event *event __maybe_unused,
-				  struct ordered_events *oe)
+int perf_event__process_finished_round(struct perf_tool *tool __maybe_unused,
+				       union perf_event *event __maybe_unused,
+				       struct ordered_events *oe)
 {
 	if (dump_trace)
 		fprintf(stdout, "\n");
@@ -1420,7 +1422,9 @@ static struct machine *machines__find_for_cpumode(struct machines *machines,
 	     (sample->cpumode == PERF_RECORD_MISC_GUEST_USER))) {
 		u32 pid;
 
-		if (event->header.type == PERF_RECORD_MMAP
+		if (sample->machine_pid)
+			pid = sample->machine_pid;
+		else if (event->header.type == PERF_RECORD_MMAP
 		    || event->header.type == PERF_RECORD_MMAP2)
 			pid = event->mmap.pid;
 		else
@@ -1706,6 +1710,8 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 		if (err)
 			dump_event(session->evlist, event, file_offset, &sample, file_path);
 		return err;
+	case PERF_RECORD_FINISHED_INIT:
+		return tool->finished_init(session, event);
 	default:
 		return -EINVAL;
 	}
@@ -2751,39 +2757,120 @@ void perf_session__fprintf_info(struct perf_session *session, FILE *fp,
 	fprintf(fp, "# ========\n#\n");
 }
 
+static int perf_session__register_guest(struct perf_session *session, pid_t machine_pid)
+{
+	struct machine *machine = machines__findnew(&session->machines, machine_pid);
+	struct thread *thread;
+
+	if (!machine)
+		return -ENOMEM;
+
+	machine->single_address_space = session->machines.host.single_address_space;
+
+	thread = machine__idle_thread(machine);
+	if (!thread)
+		return -ENOMEM;
+	thread__put(thread);
+
+	machine->kallsyms_filename = perf_data__guest_kallsyms_name(session->data, machine_pid);
+
+	return 0;
+}
+
+static int perf_session__set_guest_cpu(struct perf_session *session, pid_t pid,
+				       pid_t tid, int guest_cpu)
+{
+	struct machine *machine = &session->machines.host;
+	struct thread *thread = machine__findnew_thread(machine, pid, tid);
+
+	if (!thread)
+		return -ENOMEM;
+	thread->guest_cpu = guest_cpu;
+	thread__put(thread);
+
+	return 0;
+}
+
 int perf_event__process_id_index(struct perf_session *session,
 				 union perf_event *event)
 {
 	struct evlist *evlist = session->evlist;
 	struct perf_record_id_index *ie = &event->id_index;
+	size_t sz = ie->header.size - sizeof(*ie);
 	size_t i, nr, max_nr;
+	size_t e1_sz = sizeof(struct id_index_entry);
+	size_t e2_sz = sizeof(struct id_index_entry_2);
+	size_t etot_sz = e1_sz + e2_sz;
+	struct id_index_entry_2 *e2;
+	pid_t last_pid = 0;
 
-	max_nr = (ie->header.size - sizeof(struct perf_record_id_index)) /
-		 sizeof(struct id_index_entry);
+	max_nr = sz / e1_sz;
 	nr = ie->nr;
-	if (nr > max_nr)
+	if (nr > max_nr) {
+		printf("Too big: nr %zu max_nr %zu\n", nr, max_nr);
 		return -EINVAL;
+	}
+
+	if (sz >= nr * etot_sz) {
+		max_nr = sz / etot_sz;
+		if (nr > max_nr) {
+			printf("Too big2: nr %zu max_nr %zu\n", nr, max_nr);
+			return -EINVAL;
+		}
+		e2 = (void *)ie + sizeof(*ie) + nr * e1_sz;
+	} else {
+		e2 = NULL;
+	}
 
 	if (dump_trace)
 		fprintf(stdout, " nr: %zu\n", nr);
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++, (e2 ? e2++ : 0)) {
 		struct id_index_entry *e = &ie->entries[i];
 		struct perf_sample_id *sid;
+		int ret;
 
 		if (dump_trace) {
 			fprintf(stdout,	" ... id: %"PRI_lu64, e->id);
 			fprintf(stdout,	"  idx: %"PRI_lu64, e->idx);
 			fprintf(stdout,	"  cpu: %"PRI_ld64, e->cpu);
-			fprintf(stdout,	"  tid: %"PRI_ld64"\n", e->tid);
+			fprintf(stdout, "  tid: %"PRI_ld64, e->tid);
+			if (e2) {
+				fprintf(stdout, "  machine_pid: %"PRI_ld64, e2->machine_pid);
+				fprintf(stdout, "  vcpu: %"PRI_lu64"\n", e2->vcpu);
+			} else {
+				fprintf(stdout, "\n");
+			}
 		}
 
 		sid = evlist__id2sid(evlist, e->id);
 		if (!sid)
 			return -ENOENT;
+
 		sid->idx = e->idx;
 		sid->cpu.cpu = e->cpu;
 		sid->tid = e->tid;
+
+		if (!e2)
+			continue;
+
+		sid->machine_pid = e2->machine_pid;
+		sid->vcpu.cpu = e2->vcpu;
+
+		if (!sid->machine_pid)
+			continue;
+
+		if (sid->machine_pid != last_pid) {
+			ret = perf_session__register_guest(session, sid->machine_pid);
+			if (ret)
+				return ret;
+			last_pid = sid->machine_pid;
+			perf_guest = true;
+		}
+
+		ret = perf_session__set_guest_cpu(session, sid->machine_pid, e->tid, e2->vcpu);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 34500a3da735..be5871ea558f 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -155,4 +155,8 @@ int perf_session__deliver_synth_event(struct perf_session *session,
 int perf_event__process_id_index(struct perf_session *session,
 				 union perf_event *event);
 
+int perf_event__process_finished_round(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct ordered_events *oe);
+
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index c255a2c90cd6..5b1e6468d5e8 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -11,7 +11,7 @@ def clang_has_option(option):
     return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ]
 
 if cc_is_clang:
-    from distutils.sysconfig import get_config_vars
+    from sysconfig import get_config_vars
     vars = get_config_vars()
     for var in ('CFLAGS', 'OPT'):
         vars[var] = sub("-specs=[^ ]+", "", vars[var])
@@ -28,10 +28,10 @@ if cc_is_clang:
         if not clang_has_option("-ffat-lto-objects"):
             vars[var] = sub("-ffat-lto-objects", "", vars[var])
 
-from distutils.core import setup, Extension
+from setuptools import setup, Extension
 
-from distutils.command.build_ext   import build_ext   as _build_ext
-from distutils.command.install_lib import install_lib as _install_lib
+from setuptools.command.build_ext   import build_ext   as _build_ext
+from setuptools.command.install_lib import install_lib as _install_lib
 
 class build_ext(_build_ext):
     def finalize_options(self):
@@ -48,7 +48,9 @@ class install_lib(_install_lib):
 cflags = getenv('CFLAGS', '').split()
 # switch off several checks (need to be at the end of cflags list)
 cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter', '-Wno-redundant-decls', '-DPYTHON_PERF' ]
-if not cc_is_clang:
+if cc_is_clang:
+    cflags += ["-Wno-unused-command-line-argument" ]
+else:
     cflags += ['-Wno-cast-function-type' ]
 
 src_perf  = getenv('srctree') + '/tools/perf'
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 606f09b09226..44045565c8f8 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -374,7 +374,7 @@ static void abs_printout(struct perf_stat_config *config,
 			config->csv_output ? 0 : config->unit_width,
 			evsel->unit, config->csv_sep);
 
-	fprintf(output, "%-*s", config->csv_output ? 0 : 25, evsel__name(evsel));
+	fprintf(output, "%-*s", config->csv_output ? 0 : 32, evsel__name(evsel));
 
 	print_cgroup(config, evsel);
 }
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index ecd377938eea..75bec32d4f57 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -233,6 +233,33 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return NULL;
 }
 
+static int elf_read_program_header(Elf *elf, u64 vaddr, GElf_Phdr *phdr)
+{
+	size_t i, phdrnum;
+	u64 sz;
+
+	if (elf_getphdrnum(elf, &phdrnum))
+		return -1;
+
+	for (i = 0; i < phdrnum; i++) {
+		if (gelf_getphdr(elf, i, phdr) == NULL)
+			return -1;
+
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		sz = max(phdr->p_memsz, phdr->p_filesz);
+		if (!sz)
+			continue;
+
+		if (vaddr >= phdr->p_vaddr && (vaddr < phdr->p_vaddr + sz))
+			return 0;
+	}
+
+	/* Not found any valid program header */
+	return -1;
+}
+
 static bool want_demangle(bool is_kernel_sym)
 {
 	return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
@@ -1209,6 +1236,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 					sym.st_value);
 			used_opd = true;
 		}
+
 		/*
 		 * When loading symbols in a data mapping, ABS symbols (which
 		 * has a value of SHN_ABS in its st_shndx) failed at
@@ -1227,6 +1255,17 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 
 		gelf_getshdr(sec, &shdr);
 
+		/*
+		 * If the attribute bit SHF_ALLOC is not set, the section
+		 * doesn't occupy memory during process execution.
+		 * E.g. ".gnu.warning.*" section is used by linker to generate
+		 * warnings when calling deprecated functions, the symbols in
+		 * the section aren't loaded to memory during process execution,
+		 * so skip them.
+		 */
+		if (!(shdr.sh_flags & SHF_ALLOC))
+			continue;
+
 		secstrs = secstrs_sym;
 
 		/*
@@ -1262,11 +1301,33 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 				goto out_elf_end;
 		} else if ((used_opd && runtime_ss->adjust_symbols) ||
 			   (!used_opd && syms_ss->adjust_symbols)) {
-			pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
-				  "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__,
-				  (u64)sym.st_value, (u64)shdr.sh_addr,
-				  (u64)shdr.sh_offset);
-			sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+			GElf_Phdr phdr;
+
+			if (elf_read_program_header(syms_ss->elf,
+						    (u64)sym.st_value, &phdr)) {
+				pr_debug4("%s: failed to find program header for "
+					   "symbol: %s st_value: %#" PRIx64 "\n",
+					   __func__, elf_name, (u64)sym.st_value);
+				pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
+					"sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n",
+					__func__, (u64)sym.st_value, (u64)shdr.sh_addr,
+					(u64)shdr.sh_offset);
+				/*
+				 * Fail to find program header, let's rollback
+				 * to use shdr.sh_addr and shdr.sh_offset to
+				 * calibrate symbol's file address, though this
+				 * is not necessary for normal C ELF file, we
+				 * still need to handle java JIT symbols in this
+				 * case.
+				 */
+				sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+			} else {
+				pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
+					"p_vaddr: %#" PRIx64 " p_offset: %#" PRIx64 "\n",
+					__func__, (u64)sym.st_value, (u64)phdr.p_vaddr,
+					(u64)phdr.p_offset);
+				sym.st_value -= phdr.p_vaddr - phdr.p_offset;
+			}
 		}
 
 		demangled = demangle_sym(dso, kmodule, elf_name);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index f72baf636724..a4b22caa7c24 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -2300,11 +2300,13 @@ do_kallsyms:
 static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map)
 {
 	int err;
-	const char *kallsyms_filename = NULL;
+	const char *kallsyms_filename;
 	struct machine *machine = map__kmaps(map)->machine;
 	char path[PATH_MAX];
 
-	if (machine__is_default_guest(machine)) {
+	if (machine->kallsyms_filename) {
+		kallsyms_filename = machine->kallsyms_filename;
+	} else if (machine__is_default_guest(machine)) {
 		/*
 		 * if the user specified a vmlinux filename, use it and only
 		 * it, reporting errors to the user if it cannot be used.
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 27acdc5e5723..2ae59c03ae77 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -754,7 +754,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
 	snprintf(filename, sizeof(filename), "%s/proc/%d/task",
 		 machine->root_dir, pid);
 
-	n = scandir(filename, &dirent, filter_task, alphasort);
+	n = scandir(filename, &dirent, filter_task, NULL);
 	if (n < 0)
 		return n;
 
@@ -767,11 +767,12 @@ static int __event__synthesize_thread(union perf_event *comm_event,
 		if (*end)
 			continue;
 
-		rc = -1;
+		/* some threads may exit just after scan, ignore it */
 		if (perf_event__prepare_comm(comm_event, pid, _pid, machine,
 					     &tgid, &ppid, &kernel_thread) != 0)
-			break;
+			continue;
 
+		rc = -1;
 		if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
 						ppid, process, machine) < 0)
 			break;
@@ -987,7 +988,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
 		return 0;
 
 	snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
-	n = scandir(proc_path, &dirent, filter_task, alphasort);
+	n = scandir(proc_path, &dirent, filter_task, NULL);
 	if (n < 0)
 		return err;
 
@@ -1711,48 +1712,112 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
 	return 0;
 }
 
-int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process,
-				    struct evlist *evlist, struct machine *machine)
+int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample)
+{
+	__u64 *start = array;
+
+	/*
+	 * used for cross-endian analysis. See git commit 65014ab3
+	 * for why this goofiness is needed.
+	 */
+	union u64_swap u;
+
+	if (type & PERF_SAMPLE_TID) {
+		u.val32[0] = sample->pid;
+		u.val32[1] = sample->tid;
+		*array = u.val64;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		*array = sample->time;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_ID) {
+		*array = sample->id;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		*array = sample->stream_id;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_CPU) {
+		u.val32[0] = sample->cpu;
+		u.val32[1] = 0;
+		*array = u.val64;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_IDENTIFIER) {
+		*array = sample->id;
+		array++;
+	}
+
+	return (void *)array - (void *)start;
+}
+
+int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process,
+				      struct evlist *evlist, struct machine *machine, size_t from)
 {
 	union perf_event *ev;
 	struct evsel *evsel;
-	size_t nr = 0, i = 0, sz, max_nr, n;
+	size_t nr = 0, i = 0, sz, max_nr, n, pos;
+	size_t e1_sz = sizeof(struct id_index_entry);
+	size_t e2_sz = sizeof(struct id_index_entry_2);
+	size_t etot_sz = e1_sz + e2_sz;
+	bool e2_needed = false;
 	int err;
 
-	pr_debug2("Synthesizing id index\n");
-
-	max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) /
-		 sizeof(struct id_index_entry);
+	max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / etot_sz;
 
-	evlist__for_each_entry(evlist, evsel)
+	pos = 0;
+	evlist__for_each_entry(evlist, evsel) {
+		if (pos++ < from)
+			continue;
 		nr += evsel->core.ids;
+	}
+
+	if (!nr)
+		return 0;
+
+	pr_debug2("Synthesizing id index\n");
 
 	n = nr > max_nr ? max_nr : nr;
-	sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry);
+	sz = sizeof(struct perf_record_id_index) + n * etot_sz;
 	ev = zalloc(sz);
 	if (!ev)
 		return -ENOMEM;
 
+	sz = sizeof(struct perf_record_id_index) + n * e1_sz;
+
 	ev->id_index.header.type = PERF_RECORD_ID_INDEX;
-	ev->id_index.header.size = sz;
 	ev->id_index.nr = n;
 
+	pos = 0;
 	evlist__for_each_entry(evlist, evsel) {
 		u32 j;
 
-		for (j = 0; j < evsel->core.ids; j++) {
+		if (pos++ < from)
+			continue;
+		for (j = 0; j < evsel->core.ids; j++, i++) {
 			struct id_index_entry *e;
+			struct id_index_entry_2 *e2;
 			struct perf_sample_id *sid;
 
 			if (i >= n) {
+				ev->id_index.header.size = sz + (e2_needed ? n * e2_sz : 0);
 				err = process(tool, ev, NULL, machine);
 				if (err)
 					goto out_err;
 				nr -= n;
 				i = 0;
+				e2_needed = false;
 			}
 
-			e = &ev->id_index.entries[i++];
+			e = &ev->id_index.entries[i];
 
 			e->id = evsel->core.id[j];
 
@@ -1765,11 +1830,18 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_
 			e->idx = sid->idx;
 			e->cpu = sid->cpu.cpu;
 			e->tid = sid->tid;
+
+			if (sid->machine_pid)
+				e2_needed = true;
+
+			e2 = (void *)ev + sz;
+			e2[i].machine_pid = sid->machine_pid;
+			e2[i].vcpu        = sid->vcpu.cpu;
 		}
 	}
 
-	sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry);
-	ev->id_index.header.size = sz;
+	sz = sizeof(struct perf_record_id_index) + nr * e1_sz;
+	ev->id_index.header.size = sz + (e2_needed ? nr * e2_sz : 0);
 	ev->id_index.nr = nr;
 
 	err = process(tool, ev, NULL, machine);
@@ -1779,6 +1851,12 @@ out_err:
 	return err;
 }
 
+int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process,
+				    struct evlist *evlist, struct machine *machine)
+{
+	return __perf_event__synthesize_id_index(tool, process, evlist, machine, 0);
+}
+
 int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
 				  struct target *target, struct perf_thread_map *threads,
 				  perf_event__handler_t process, bool needs_mmap,
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index 78a0450db164..81cb3d6af0b9 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h
@@ -55,6 +55,8 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evs
 int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process);
 int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine);
+int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from);
+int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample);
 int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data);
 int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 665e5c0618ed..e3e5427e1c3c 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -47,6 +47,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
 		thread->tid = tid;
 		thread->ppid = -1;
 		thread->cpu = -1;
+		thread->guest_cpu = -1;
 		thread->lbr_stitch_enable = false;
 		INIT_LIST_HEAD(&thread->namespaces_list);
 		INIT_LIST_HEAD(&thread->comm_list);
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index b066fb30d203..241f300d7d6e 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -39,6 +39,7 @@ struct thread {
 	pid_t			tid;
 	pid_t			ppid;
 	int			cpu;
+	int			guest_cpu; /* For QEMU thread */
 	refcount_t		refcnt;
 	bool			comm_set;
 	int			comm_len;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index f2352dba1875..c957fb849ac6 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -76,7 +76,8 @@ struct perf_tool {
 			stat_config,
 			stat,
 			stat_round,
-			feature;
+			feature,
+			finished_init;
 	event_op4	compressed;
 	event_op3	auxtrace;
 	bool		ordered_events;
diff --git a/tools/perf/util/topdown.c b/tools/perf/util/topdown.c
index a369f84ceb6a..1090841550f7 100644
--- a/tools/perf/util/topdown.c
+++ b/tools/perf/util/topdown.c
@@ -65,3 +65,10 @@ __weak bool arch_topdown_sample_read(struct evsel *leader __maybe_unused)
 {
 	return false;
 }
+
+__weak const char *arch_get_topdown_pmu_name(struct evlist *evlist
+					     __maybe_unused,
+					     bool warn __maybe_unused)
+{
+	return "cpu";
+}
diff --git a/tools/perf/util/topdown.h b/tools/perf/util/topdown.h
index 118e75281f93..f9531528c559 100644
--- a/tools/perf/util/topdown.h
+++ b/tools/perf/util/topdown.h
@@ -2,11 +2,12 @@
 #ifndef TOPDOWN_H
 #define TOPDOWN_H 1
 #include "evsel.h"
+#include "evlist.h"
 
 bool arch_topdown_check_group(bool *warn);
 void arch_topdown_group_warn(void);
 bool arch_topdown_sample_read(struct evsel *leader);
-
+const char *arch_get_topdown_pmu_name(struct evlist *evlist, bool warn);
 int topdown_filter_events(const char **attr, char **str, bool use_group,
 			  const char *pmu_name);
 
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index a65f65d0857e..892c323b4ac9 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -19,16 +19,24 @@
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <internal/lib.h> // page_size
+#include <sys/param.h>
 
 #include "trace-event.h"
+#include "tracepoint.h"
 #include <api/fs/tracing_path.h>
 #include "evsel.h"
 #include "debug.h"
 
 #define VERSION "0.6"
+#define MAX_EVENT_LENGTH 512
 
 static int output_fd;
 
+struct tracepoint_path {
+	char *system;
+	char *name;
+	struct tracepoint_path *next;
+};
 
 int bigendian(void)
 {
@@ -400,6 +408,94 @@ put_tracepoints_path(struct tracepoint_path *tps)
 	}
 }
 
+static struct tracepoint_path *tracepoint_id_to_path(u64 config)
+{
+	struct tracepoint_path *path = NULL;
+	DIR *sys_dir, *evt_dir;
+	struct dirent *sys_dirent, *evt_dirent;
+	char id_buf[24];
+	int fd;
+	u64 id;
+	char evt_path[MAXPATHLEN];
+	char *dir_path;
+
+	sys_dir = tracing_events__opendir();
+	if (!sys_dir)
+		return NULL;
+
+	for_each_subsystem(sys_dir, sys_dirent) {
+		dir_path = get_events_file(sys_dirent->d_name);
+		if (!dir_path)
+			continue;
+		evt_dir = opendir(dir_path);
+		if (!evt_dir)
+			goto next;
+
+		for_each_event(dir_path, evt_dir, evt_dirent) {
+
+			scnprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path,
+				  evt_dirent->d_name);
+			fd = open(evt_path, O_RDONLY);
+			if (fd < 0)
+				continue;
+			if (read(fd, id_buf, sizeof(id_buf)) < 0) {
+				close(fd);
+				continue;
+			}
+			close(fd);
+			id = atoll(id_buf);
+			if (id == config) {
+				put_events_file(dir_path);
+				closedir(evt_dir);
+				closedir(sys_dir);
+				path = zalloc(sizeof(*path));
+				if (!path)
+					return NULL;
+				if (asprintf(&path->system, "%.*s",
+					     MAX_EVENT_LENGTH, sys_dirent->d_name) < 0) {
+					free(path);
+					return NULL;
+				}
+				if (asprintf(&path->name, "%.*s",
+					     MAX_EVENT_LENGTH, evt_dirent->d_name) < 0) {
+					zfree(&path->system);
+					free(path);
+					return NULL;
+				}
+				return path;
+			}
+		}
+		closedir(evt_dir);
+next:
+		put_events_file(dir_path);
+	}
+
+	closedir(sys_dir);
+	return NULL;
+}
+
+static struct tracepoint_path *tracepoint_name_to_path(const char *name)
+{
+	struct tracepoint_path *path = zalloc(sizeof(*path));
+	char *str = strchr(name, ':');
+
+	if (path == NULL || str == NULL) {
+		free(path);
+		return NULL;
+	}
+
+	path->system = strndup(name, str - name);
+	path->name = strdup(str+1);
+
+	if (path->system == NULL || path->name == NULL) {
+		zfree(&path->system);
+		zfree(&path->name);
+		zfree(&path);
+	}
+
+	return path;
+}
+
 static struct tracepoint_path *
 get_tracepoints_path(struct list_head *pattrs)
 {
diff --git a/tools/perf/util/tracepoint.c b/tools/perf/util/tracepoint.c
new file mode 100644
index 000000000000..89ef56c43311
--- /dev/null
+++ b/tools/perf/util/tracepoint.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "tracepoint.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/param.h>
+#include <unistd.h>
+
+#include <api/fs/tracing_path.h>
+
+int tp_event_has_id(const char *dir_path, struct dirent *evt_dir)
+{
+	char evt_path[MAXPATHLEN];
+	int fd;
+
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, evt_dir->d_name);
+	fd = open(evt_path, O_RDONLY);
+	if (fd < 0)
+		return -EINVAL;
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Check whether event is in <debugfs_mount_point>/tracing/events
+ */
+int is_valid_tracepoint(const char *event_string)
+{
+	DIR *sys_dir, *evt_dir;
+	struct dirent *sys_dirent, *evt_dirent;
+	char evt_path[MAXPATHLEN];
+	char *dir_path;
+
+	sys_dir = tracing_events__opendir();
+	if (!sys_dir)
+		return 0;
+
+	for_each_subsystem(sys_dir, sys_dirent) {
+		dir_path = get_events_file(sys_dirent->d_name);
+		if (!dir_path)
+			continue;
+		evt_dir = opendir(dir_path);
+		if (!evt_dir)
+			goto next;
+
+		for_each_event(dir_path, evt_dir, evt_dirent) {
+			snprintf(evt_path, MAXPATHLEN, "%s:%s",
+				 sys_dirent->d_name, evt_dirent->d_name);
+			if (!strcmp(evt_path, event_string)) {
+				closedir(evt_dir);
+				closedir(sys_dir);
+				return 1;
+			}
+		}
+		closedir(evt_dir);
+next:
+		put_events_file(dir_path);
+	}
+	closedir(sys_dir);
+	return 0;
+}
diff --git a/tools/perf/util/tracepoint.h b/tools/perf/util/tracepoint.h
new file mode 100644
index 000000000000..c4a110fe87d7
--- /dev/null
+++ b/tools/perf/util/tracepoint.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_TRACEPOINT_H
+#define __PERF_TRACEPOINT_H
+
+#include <dirent.h>
+#include <string.h>
+
+int tp_event_has_id(const char *dir_path, struct dirent *evt_dir);
+
+#define for_each_event(dir_path, evt_dir, evt_dirent)		\
+	while ((evt_dirent = readdir(evt_dir)) != NULL)		\
+		if (evt_dirent->d_type == DT_DIR &&		\
+		    (strcmp(evt_dirent->d_name, ".")) &&	\
+		    (strcmp(evt_dirent->d_name, "..")) &&	\
+		    (!tp_event_has_id(dir_path, evt_dirent)))
+
+#define for_each_subsystem(sys_dir, sys_dirent)			\
+	while ((sys_dirent = readdir(sys_dir)) != NULL)		\
+		if (sys_dirent->d_type == DT_DIR &&		\
+		    (strcmp(sys_dirent->d_name, ".")) &&	\
+		    (strcmp(sys_dirent->d_name, "..")))
+
+int is_valid_tracepoint(const char *event_string);
+
+#endif /* __PERF_TRACEPOINT_H */
diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h
index 7d83a31732a7..88fd1c4c1cb8 100644
--- a/tools/perf/util/tsc.h
+++ b/tools/perf/util/tsc.h
@@ -25,6 +25,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
 u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
 u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
 u64 rdtsc(void);
+double arch_get_tsc_freq(void);
 
 size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp);
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 37622699c91a..81b6bd6e1536 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -174,7 +174,7 @@ static int elf_section_address_and_offset(int fd, const char *name, u64 *address
 	Elf *elf;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr;
-	int ret;
+	int ret = -1;
 
 	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
 	if (elf == NULL)
@@ -197,7 +197,7 @@ out_err:
 #ifndef NO_LIBUNWIND_DEBUG_FRAME
 static u64 elf_section_offset(int fd, const char *name)
 {
-	u64 address, offset;
+	u64 address, offset = 0;
 
 	if (elf_section_address_and_offset(fd, name, &address, &offset))
 		return 0;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index eeb83c80f458..391c1e928bd7 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/log2.h>
 #include <linux/time64.h>
+#include <linux/overflow.h>
 #include <unistd.h>
 #include "cap.h"
 #include "strlist.h"
@@ -200,7 +201,7 @@ static int rm_rf_depth_pat(const char *path, int depth, const char **pat)
 	return rmdir(path);
 }
 
-static int rm_rf_kcore_dir(const char *path)
+static int rm_rf_a_kcore_dir(const char *path, const char *name)
 {
 	char kcore_dir_path[PATH_MAX];
 	const char *pat[] = {
@@ -210,11 +211,44 @@ static int rm_rf_kcore_dir(const char *path)
 		NULL,
 	};
 
-	snprintf(kcore_dir_path, sizeof(kcore_dir_path), "%s/kcore_dir", path);
+	snprintf(kcore_dir_path, sizeof(kcore_dir_path), "%s/%s", path, name);
 
 	return rm_rf_depth_pat(kcore_dir_path, 0, pat);
 }
 
+static bool kcore_dir_filter(const char *name __maybe_unused, struct dirent *d)
+{
+	const char *pat[] = {
+		"kcore_dir",
+		"kcore_dir__[1-9]*",
+		NULL,
+	};
+
+	return match_pat(d->d_name, pat);
+}
+
+static int rm_rf_kcore_dir(const char *path)
+{
+	struct strlist *kcore_dirs;
+	struct str_node *nd;
+	int ret;
+
+	kcore_dirs = lsdir(path, kcore_dir_filter);
+
+	if (!kcore_dirs)
+		return 0;
+
+	strlist__for_each_entry(nd, kcore_dirs) {
+		ret = rm_rf_a_kcore_dir(path, nd->s);
+		if (ret)
+			return ret;
+	}
+
+	strlist__delete(kcore_dirs);
+
+	return 0;
+}
+
 int rm_rf_perf_data(const char *path)
 {
 	const char *pat[] = {
@@ -467,3 +501,35 @@ char *filename_with_chroot(int pid, const char *filename)
 
 	return new_name;
 }
+
+/*
+ * Reallocate an array *arr of size *arr_sz so that it is big enough to contain
+ * x elements of size msz, initializing new entries to *init_val or zero if
+ * init_val is NULL
+ */
+int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x, size_t msz, const void *init_val)
+{
+	size_t new_sz = *arr_sz;
+	void *new_arr;
+	size_t i;
+
+	if (!new_sz)
+		new_sz = msz >= 64 ? 1 : roundup(64, msz); /* Start with at least 64 bytes */
+	while (x >= new_sz) {
+		if (check_mul_overflow(new_sz, (size_t)2, &new_sz))
+			return -ENOMEM;
+	}
+	if (new_sz == *arr_sz)
+		return 0;
+	new_arr = calloc(new_sz, msz);
+	if (!new_arr)
+		return -ENOMEM;
+	memcpy(new_arr, *arr, *arr_sz * msz);
+	if (init_val) {
+		for (i = *arr_sz; i < new_sz; i++)
+			memcpy(new_arr + (i * msz), init_val, msz);
+	}
+	*arr = new_arr;
+	*arr_sz = new_sz;
+	return 0;
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 0f78f1e7782d..c1f2d423a9ec 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -79,4 +79,19 @@ struct perf_debuginfod {
 void perf_debuginfod_setup(struct perf_debuginfod *di);
 
 char *filename_with_chroot(int pid, const char *filename);
+
+int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x,
+			       size_t msz, const void *init_val);
+
+#define realloc_array_as_needed(a, n, x, v) ({			\
+	typeof(x) __x = (x);					\
+	__x >= (n) ?						\
+		do_realloc_array_as_needed((void **)&(a),	\
+					   &(n),		\
+					   __x,			\
+					   sizeof(*(a)),	\
+					   (const void *)(v)) :	\
+		0;						\
+	})
+
 #endif /* GIT_COMPAT_UTIL_H */