-rw-r--r--  arch/x86/events/core.c                                 |  63
-rw-r--r--  arch/x86/events/intel/bts.c                            |   3
-rw-r--r--  arch/x86/events/intel/pt.c                             |  29
-rw-r--r--  arch/x86/events/intel/uncore.c                         |   9
-rw-r--r--  arch/x86/events/intel/uncore.h                         |   2
-rw-r--r--  arch/x86/events/intel/uncore_snb.c                     | 185
-rw-r--r--  include/linux/perf_event.h                             |  11
-rw-r--r--  include/linux/uprobes.h                                |  24
-rw-r--r--  kernel/events/core.c                                   | 402
-rw-r--r--  kernel/events/uprobes.c                                | 139
-rw-r--r--  kernel/trace/bpf_trace.c                               |  25
-rw-r--r--  kernel/trace/trace_uprobe.c                            |  31
-rw-r--r--  tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c  |  26
13 files changed, 619 insertions(+), 330 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index be01823b1bb4..65ab6460aed4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -41,6 +41,8 @@
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
+#include <asm/uprobes.h>
+#include <asm/ibt.h>
#include "perf_event.h"
@@ -2816,6 +2818,46 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
+#ifdef CONFIG_UPROBES
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
+
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_UPROBES */
+
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -2827,6 +2869,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
+ u32 ret_addr;
if (user_64bit_mode(regs))
return 0;
@@ -2836,6 +2879,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+ /* see perf_callchain_user() below for why we do this */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const u32 __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2864,6 +2913,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
{
struct stack_frame frame;
const struct stack_frame __user *fp;
+ unsigned long ret_addr;
if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
@@ -2887,6 +2937,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
return;
pagefault_disable();
+
+ /*
+ * If we are called from uprobe handler, and we are indeed at the very
+ * entry to user function (which is normally a `push %rbp` instruction,
+ * under assumption of application being compiled with frame pointers),
+ * we should read return address from *regs->sp before proceeding
+ * to follow frame pointers, otherwise we'll skip immediate caller
+ * as %rbp is not yet setup.
+ */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const unsigned long __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
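
The is_uprobe_at_func_entry() heuristic added above keys off the first bytes of the probed function: 0x55 (push %rbp/%ebp) when the target is built with frame pointers, or endbr64 when it is built with IBT. As a rough userspace illustration (not part of the patch, assuming x86-64 Linux and a GCC/Clang build), the snippet below dumps the first bytes of a local function so you can see which of the two patterns the kernel heuristic would match; probed_func and the whole program are placeholders.

/* Build e.g. with: gcc -O2 -fno-omit-frame-pointer entry_bytes.c */
#include <stdio.h>
#include <string.h>

__attribute__((noinline))
static int probed_func(int x)
{
	return x * 2;
}

int main(void)
{
	const unsigned char *p = (const unsigned char *)probed_func;
	const unsigned char endbr64[] = { 0xf3, 0x0f, 0x1e, 0xfa };

	if (p[0] == 0x55)
		printf("entry starts with push %%rbp (frame pointers)\n");
	else if (!memcmp(p, endbr64, sizeof(endbr64)))
		printf("entry starts with endbr64 (IBT)\n");
	else
		printf("first byte 0x%02x: neither pattern, heuristic would not match\n", p[0]);

	return probed_func(21) == 42 ? 0 : 1;
}
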
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 974e917e65b2..8f78b0c900ef 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -557,9 +557,6 @@ static int bts_event_init(struct perf_event *event)
* disabled, so disallow intel_bts driver for unprivileged
* users on paranoid systems since it provides trace data
* to the user in a zero-copy fashion.
- *
- * Note that the default paranoia setting permits unprivileged
- * users to profile the kernel.
*/
if (event->attr.exclude_kernel) {
ret = perf_allow_kernel(&event->attr);
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index b4aa8daa4773..fd4670a6694e 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -416,7 +416,7 @@ static bool pt_event_valid(struct perf_event *event)
static void pt_config_start(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
- u64 ctl = event->hw.config;
+ u64 ctl = event->hw.aux_config;
ctl |= RTIT_CTL_TRACEEN;
if (READ_ONCE(pt->vmx_on))
@@ -424,7 +424,7 @@ static void pt_config_start(struct perf_event *event)
else
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
- WRITE_ONCE(event->hw.config, ctl);
+ WRITE_ONCE(event->hw.aux_config, ctl);
}
/* Address ranges and their corresponding msr configuration registers */
@@ -503,7 +503,7 @@ static void pt_config(struct perf_event *event)
u64 reg;
/* First round: clear STATUS, in particular the PSB byte counter. */
- if (!event->hw.config) {
+ if (!event->hw.aux_config) {
perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
@@ -533,14 +533,14 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
- event->hw.config = reg;
+ event->hw.aux_config = reg;
pt_config_start(event);
}
static void pt_config_stop(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
- u64 ctl = READ_ONCE(event->hw.config);
+ u64 ctl = READ_ONCE(event->hw.aux_config);
/* may be already stopped by a PMI */
if (!(ctl & RTIT_CTL_TRACEEN))
@@ -550,7 +550,7 @@ static void pt_config_stop(struct perf_event *event)
if (!READ_ONCE(pt->vmx_on))
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
- WRITE_ONCE(event->hw.config, ctl);
+ WRITE_ONCE(event->hw.aux_config, ctl);
/*
* A wrmsr that disables trace generation serializes other PT
@@ -1557,7 +1557,7 @@ void intel_pt_handle_vmx(int on)
/* Turn PTs back on */
if (!on && event)
- wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+ wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config);
local_irq_restore(flags);
}
@@ -1606,6 +1606,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
* see comment in intel_pt_interrupt().
*/
WRITE_ONCE(pt->handle_nmi, 0);
+ barrier();
pt_config_stop(event);
@@ -1657,11 +1658,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
return 0;
/*
- * Here, handle_nmi tells us if the tracing is on
+ * There is no PT interrupt in this mode, so stop the trace and it will
+ * remain stopped while the buffer is copied.
*/
- if (READ_ONCE(pt->handle_nmi))
- pt_config_stop(event);
-
+ pt_config_stop(event);
pt_read_offset(buf);
pt_update_head(pt);
@@ -1673,11 +1673,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
ret = perf_output_copy_aux(&pt->handle, handle, from, to);
/*
- * If the tracing was on when we turned up, restart it.
- * Compiler barrier not needed as we couldn't have been
- * preempted by anything that touches pt->handle_nmi.
+ * Here, handle_nmi tells us if the tracing was on.
+ * If the tracing was on, restart it.
*/
- if (pt->handle_nmi)
+ if (READ_ONCE(pt->handle_nmi))
pt_config_start(event);
return ret;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 64ca8625eb58..d98fac567684 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1816,6 +1816,11 @@ static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
.mmio_init = adl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
+ .cpu_init = lnl_uncore_cpu_init,
+ .mmio_init = lnl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1893,6 +1898,10 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 027ef292c602..79ff32e13dcc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -611,10 +611,12 @@ void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
+void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
+void lnl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 9462fd9f3b7a..3934e1e4e3b1 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -252,6 +252,7 @@ DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(threshold2, threshold, "config:24-31");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
@@ -746,6 +747,34 @@ void mtl_uncore_cpu_init(void)
uncore_msr_uncores = mtl_msr_uncores;
}
+static struct intel_uncore_type *lnl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_arb,
+ NULL
+};
+
+#define LNL_UNC_MSR_GLOBAL_CTL 0x240e
+
+static void lnl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops lnl_uncore_msr_ops = {
+ .init_box = lnl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+void lnl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 4;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = lnl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -1475,39 +1504,45 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
ids++;
}
+ /* Just try to grab 00:00.0 device */
+ if (!mc_dev)
+ mc_dev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+
return mc_dev;
}
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
-static void __uncore_imc_init_box(struct intel_uncore_box *box,
- unsigned int base_offset)
+static void
+uncore_get_box_mmio_addr(struct intel_uncore_box *box,
+ unsigned int base_offset,
+ int bar_offset, int step)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
struct intel_uncore_type *type = pmu->type;
resource_size_t addr;
- u32 mch_bar;
+ u32 bar;
if (!pdev) {
pr_warn("perf uncore: Cannot find matched IMC device.\n");
return;
}
- pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar);
- /* MCHBAR is disabled */
- if (!(mch_bar & BIT(0))) {
- pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
+ pci_read_config_dword(pdev, bar_offset, &bar);
+ if (!(bar & BIT(0))) {
+ pr_warn("perf uncore: BAR 0x%x is disabled. Failed to map %s counters.\n",
+ bar_offset, type->name);
pci_dev_put(pdev);
return;
}
- mch_bar &= ~BIT(0);
- addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx);
+ bar &= ~BIT(0);
+ addr = (resource_size_t)(bar + step * pmu->pmu_idx);
#ifdef CONFIG_PHYS_ADDR_T_64BIT
- pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar);
- addr |= ((resource_size_t)mch_bar << 32);
+ pci_read_config_dword(pdev, bar_offset + 4, &bar);
+ addr |= ((resource_size_t)bar << 32);
#endif
addr += base_offset;
@@ -1518,6 +1553,14 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
pci_dev_put(pdev);
}
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
+{
+ uncore_get_box_mmio_addr(box, base_offset,
+ SNB_UNCORE_PCI_IMC_BAR_OFFSET,
+ TGL_UNCORE_MMIO_IMC_MEM_OFFSET);
+}
+
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
__uncore_imc_init_box(box, 0);
@@ -1612,14 +1655,17 @@ static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
writel(0, box->io_addr + uncore_mmio_box_ctl(box));
}
+#define MMIO_UNCORE_COMMON_OPS() \
+ .exit_box = uncore_mmio_exit_box, \
+ .disable_box = adl_uncore_mmio_disable_box, \
+ .enable_box = adl_uncore_mmio_enable_box, \
+ .disable_event = intel_generic_uncore_mmio_disable_event, \
+ .enable_event = intel_generic_uncore_mmio_enable_event, \
+ .read_counter = uncore_mmio_read_counter,
+
static struct intel_uncore_ops adl_uncore_mmio_ops = {
.init_box = adl_uncore_imc_init_box,
- .exit_box = uncore_mmio_exit_box,
- .disable_box = adl_uncore_mmio_disable_box,
- .enable_box = adl_uncore_mmio_enable_box,
- .disable_event = intel_generic_uncore_mmio_disable_event,
- .enable_event = intel_generic_uncore_mmio_enable_event,
- .read_counter = uncore_mmio_read_counter,
+ MMIO_UNCORE_COMMON_OPS()
};
#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
@@ -1703,3 +1749,108 @@ void adl_uncore_mmio_init(void)
}
/* end of Alder Lake MMIO uncore support */
+
+/* Lunar Lake MMIO uncore support */
+#define LNL_UNCORE_PCI_SAFBAR_OFFSET 0x68
+#define LNL_UNCORE_MAP_SIZE 0x1000
+#define LNL_UNCORE_SNCU_BASE 0xE4B000
+#define LNL_UNCORE_SNCU_CTR 0x390
+#define LNL_UNCORE_SNCU_CTRL 0x398
+#define LNL_UNCORE_SNCU_BOX_CTL 0x380
+#define LNL_UNCORE_GLOBAL_CTL 0x700
+#define LNL_UNCORE_HBO_BASE 0xE54000
+#define LNL_UNCORE_HBO_OFFSET -4096
+#define LNL_UNCORE_HBO_CTR 0x570
+#define LNL_UNCORE_HBO_CTRL 0x550
+#define LNL_UNCORE_HBO_BOX_CTL 0x548
+
+#define LNL_UNC_CTL_THRESHOLD 0xff000000
+#define LNL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ LNL_UNC_CTL_THRESHOLD)
+
+static struct attribute *lnl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold2.attr,
+ NULL
+};
+
+static const struct attribute_group lnl_uncore_format_group = {
+ .name = "format",
+ .attrs = lnl_uncore_formats_attr,
+};
+
+static void lnl_uncore_hbo_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_HBO_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ LNL_UNCORE_HBO_OFFSET);
+}
+
+static struct intel_uncore_ops lnl_uncore_hbo_ops = {
+ .init_box = lnl_uncore_hbo_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_hbo = {
+ .name = "hbo",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_HBO_CTR,
+ .event_ctl = LNL_UNCORE_HBO_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_HBO_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_hbo_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static void lnl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_SNCU_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ 0);
+
+ if (box->io_addr)
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + LNL_UNCORE_GLOBAL_CTL);
+}
+
+static struct intel_uncore_ops lnl_uncore_sncu_ops = {
+ .init_box = lnl_uncore_sncu_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_SNCU_CTR,
+ .event_ctl = LNL_UNCORE_SNCU_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_SNCU_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_sncu_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static struct intel_uncore_type *lnl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &lnl_uncore_hbo,
+ &lnl_uncore_sncu,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void lnl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = lnl_mmio_uncores;
+}
+
+/* end of Lunar Lake MMIO uncore support */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a8942277dda..701549967c18 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -168,6 +168,9 @@ struct hw_perf_event {
struct hw_perf_event_extra extra_reg;
struct hw_perf_event_extra branch_reg;
};
+ struct { /* aux / Intel-PT */
+ u64 aux_config;
+ };
struct { /* software */
struct hrtimer hrtimer;
};
@@ -963,12 +966,16 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
- * Sum (event->pending_work + event->pending_work)
+ * The count of events for which using the switch-out fast path
+ * should be avoided.
+ *
+ * Sum (event->pending_work + events with
+ * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
*/
- local_t nr_pending;
+ local_t nr_no_switch_fast;
};
struct perf_cpu_pmu_context {
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..f50df1fa93e7 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/wait.h>
+struct uprobe;
struct vm_area_struct;
struct mm_struct;
struct inode;
@@ -76,6 +77,8 @@ struct uprobe_task {
struct uprobe *active_uprobe;
unsigned long xol_vaddr;
+ struct arch_uprobe *auprobe;
+
struct return_instance *return_instances;
unsigned int depth;
};
@@ -110,10 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
-extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
-extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
-extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
+extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
+extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void uprobe_start_dup_mmap(void);
@@ -151,22 +153,18 @@ static inline void uprobes_init(void)
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
-static inline int
-uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- return -ENOSYS;
-}
-static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+static inline struct uprobe *
+uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
- return -ENOSYS;
+ return ERR_PTR(-ENOSYS);
}
static inline int
-uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
+uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
{
return -ENOSYS;
}
static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
}
static inline int uprobe_mmap(struct vm_area_struct *vma)
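
The prototypes above replace the inode+offset based uprobe_register()/uprobe_register_refctr()/uprobe_unregister() with a handle-based API: uprobe_register() now returns a struct uprobe * (or an ERR_PTR), and uprobe_unregister()/uprobe_apply() take that handle. A minimal, hedged consumer-side sketch of the new flow follows; my_handler, my_consumer, my_attach() and my_detach() are made-up names, and the handler signature is the existing uprobe_consumer one, shown only for illustration.

#include <linux/uprobes.h>
#include <linux/err.h>

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	return 0;	/* 0: keep the breakpoint installed for this task */
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
};

static struct uprobe *my_uprobe;

static int my_attach(struct inode *inode, loff_t offset)
{
	/* no SDT reference counter here, so ref_ctr_offset is 0 */
	my_uprobe = uprobe_register(inode, offset, 0, &my_consumer);
	if (IS_ERR(my_uprobe))
		return PTR_ERR(my_uprobe);
	return 0;
}

static void my_detach(void)
{
	if (my_uprobe) {
		/* unregistration takes the handle, not inode+offset */
		uprobe_unregister(my_uprobe, &my_consumer);
		my_uprobe = NULL;
	}
}
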
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8a6c6bbcd658..4acec97e5448 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
@@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
@@ -291,22 +327,25 @@ again:
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
@@ -369,16 +408,6 @@ unlock:
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
@@ -685,30 +714,32 @@ do { \
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -865,7 +896,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -877,7 +908,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1769,6 +1800,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
typeof(*event), group_node))
/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
+/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1798,6 +1837,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
@@ -2022,6 +2063,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
@@ -2317,6 +2360,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
event_sched_out(event, ctx);
}
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
+}
+
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
@@ -2336,10 +2418,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
@@ -2424,12 +2503,8 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
@@ -2645,7 +2720,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@ -2655,18 +2731,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2684,16 +2761,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2704,10 +2777,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@@ -2718,15 +2795,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_event_sched_in(cpuctx, task_ctx);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
@@ -2735,7 +2816,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2791,9 +2872,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2936,8 +3018,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2945,25 +3026,21 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3231,7 +3308,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3239,7 +3316,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
@@ -3265,8 +3342,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@@ -3297,34 +3383,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
@@ -3517,12 +3605,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_disable(ctx, false);
- /* PMIs are disabled; ctx->nr_pending is stable. */
- if (local_read(&ctx->nr_pending) ||
- local_read(&next_ctx->nr_pending)) {
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
@@ -3563,7 +3656,7 @@ unlock:
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@@ -3861,29 +3954,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@@ -3907,7 +3993,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -3919,12 +4005,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3967,10 +4057,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -4311,14 +4401,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4384,7 +4474,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4396,9 +4486,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4501,10 +4589,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4539,8 +4624,11 @@ unlock:
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
@@ -4701,10 +4789,7 @@ again:
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
@@ -5205,7 +5290,7 @@ static void perf_pending_task_sync(struct perf_event *event)
*/
if (task_work_cancel(current, head)) {
event->pending_work = 0;
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
return;
}
@@ -5499,7 +5584,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -5508,7 +5593,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -5590,14 +5675,14 @@ static int __perf_read_group_add(struct perf_event *leader,
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -6177,7 +6262,7 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
@@ -6874,7 +6959,7 @@ static void perf_pending_task(struct callback_head *head)
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
rcuwait_wake_up(&event->pending_work_wait);
}
rcu_read_unlock();
@@ -7256,7 +7341,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -7274,14 +7359,15 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
@@ -7301,7 +7387,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
@@ -7316,7 +7402,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -7337,6 +7423,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
@@ -9747,7 +9837,7 @@ static int __perf_event_overflow(struct perf_event *event,
if (!event->pending_work &&
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
- local_inc(&event->ctx->nr_pending);
+ local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -12064,10 +12154,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
@@ -13091,7 +13183,7 @@ static void sync_child_event(struct perf_event *child_event)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
@@ -13182,7 +13274,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -13731,7 +13823,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
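
One user-visible effect of the nr_no_switch_fast / PERF_SAMPLE_READ changes above is that perf_event_alloc() now accepts attr.inherit together with PERF_SAMPLE_READ as long as PERF_SAMPLE_TID is also requested, so inherited events report per-thread counts instead of being rejected with -EINVAL. A hedged userspace sketch (not from the patch) that opens such an event; the software event choice and sample period are arbitrary.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.sample_period = 1000000;
	attr.inherit = 1;
	/* inherit + PERF_SAMPLE_READ now requires PERF_SAMPLE_TID as well */
	attr.sample_type = PERF_SAMPLE_READ | PERF_SAMPLE_TID;
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");	/* kernels without this change return EINVAL */
		return 1;
	}
	printf("inherited PERF_SAMPLE_READ event opened, fd=%ld\n", fd);
	close(fd);
	return 0;
}
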
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 50d7949be2b1..cac45ea4c284 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_lock held for write.
+ * Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -725,7 +725,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
- return NULL;
+ return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
@@ -939,7 +939,6 @@ static void delete_uprobe(struct uprobe *uprobe)
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- put_uprobe(uprobe);
}
struct map_info {
@@ -1046,7 +1045,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
-
+ /*
+ * We take mmap_lock for writing to avoid the race with
+ * find_active_uprobe() which takes mmap_lock for reading.
+ * Thus this install_breakpoint() can not make
+ * is_trap_at_addr() true right after find_uprobe()
+ * returns NULL in find_active_uprobe().
+ */
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
@@ -1079,92 +1084,85 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static void
-__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
- int err;
-
- if (WARN_ON(!consumer_del(uprobe, uc)))
- return;
-
- err = register_for_each_vma(uprobe, NULL);
- /* TODO : cant unregister? schedule a worker thread */
- if (!uprobe->consumers && !err)
- delete_uprobe(uprobe);
-}
-
-/*
+/**
* uprobe_unregister - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+ * @uprobe: uprobe to remove
* @uc: identify which probe if multiple probes are colocated.
*/
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- struct uprobe *uprobe;
-
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return;
+ int err;
down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
+ if (WARN_ON(!consumer_del(uprobe, uc)))
+ err = -ENOENT;
+ else
+ err = register_for_each_vma(uprobe, NULL);
+
+ /* TODO : cant unregister? schedule a worker thread */
+ if (!err) {
+ if (!uprobe->consumers)
+ delete_uprobe(uprobe);
+ else
+ err = -EBUSY;
+ }
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
+
+ if (!err)
+ put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
-/*
- * __uprobe_register - register a probe
+/**
+ * uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
- * Apart from the access refcount, __uprobe_register() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of __uprobe_register() is required to keep @inode
+ * unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
- * Return errno if it cannot successully install probes
- * else return 0 (success)
+ * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
-static int __uprobe_register(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+struct uprobe *uprobe_register(struct inode *inode,
+ loff_t offset, loff_t ref_ctr_offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->read_folio &&
!shmem_mapping(inode->i_mapping))
- return -EIO;
+ return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
- if (!uprobe)
- return -ENOMEM;
if (IS_ERR(uprobe))
- return PTR_ERR(uprobe);
+ return uprobe;
/*
* We can race with uprobe_unregister()->delete_uprobe().
@@ -1175,56 +1173,39 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
- if (ret)
- __uprobe_unregister(uprobe, uc);
}
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN))
- goto retry;
- return ret;
-}
+ if (ret) {
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ uprobe_unregister(uprobe, uc);
+ return ERR_PTR(ret);
+ }
-int uprobe_register(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, 0, uc);
+ return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
-int uprobe_register_refctr(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, ref_ctr_offset, uc);
-}
-EXPORT_SYMBOL_GPL(uprobe_register_refctr);
-
-/*
- * uprobe_apply - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+/**
+ * uprobe_apply - add or remove the breakpoints according to @uc->filter
+ * @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
+ * Return: 0 on success or negative error code.
*/
-int uprobe_apply(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc, bool add)
+int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
- struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return ret;
-
down_write(&uprobe->register_rwsem);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
return ret;
}
@@ -2028,13 +2009,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
- /*
- * The NULL 'tsk' here ensures that any faults that occur here
- * will not be accounted to the task. 'mm' *is* current->mm,
- * but we treat this as a 'remote' access since it is
- * essentially a kernel access to the memory.
- */
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
+ result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
@@ -2081,6 +2056,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
bool need_prep = false; /* prepare return uprobe, when needed */
down_read(&uprobe->register_rwsem);
+ current->utask->auprobe = &uprobe->arch;
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = 0;
@@ -2095,6 +2071,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
remove &= rc;
}
+ current->utask->auprobe = NULL;
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cd098846e251..4e391daafa64 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3160,6 +3160,7 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
@@ -3178,15 +3179,12 @@ struct bpf_uprobe_multi_run_ctx {
struct bpf_uprobe *uprobe;
};
-static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
- u32 cnt)
+static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
- for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
- }
+ for (i = 0; i < cnt; i++)
+ uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer);
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3194,7 +3192,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
@@ -3480,12 +3478,13 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
- err = uprobe_register_refctr(d_real_inode(link->path.dentry),
- uprobes[i].offset,
- uprobes[i].ref_ctr_offset,
- &uprobes[i].consumer);
- if (err) {
- bpf_uprobe_unregister(&path, uprobes, i);
+ uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (IS_ERR(uprobes[i].uprobe)) {
+ err = PTR_ERR(uprobes[i].uprobe);
+ bpf_uprobe_unregister(uprobes, i);
goto error_free;
}
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c98e3b3386ba..52e76a73fa7c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -58,8 +58,8 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
- struct inode *inode;
char *filename;
+ struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
unsigned long nhit;
@@ -1084,21 +1084,16 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- int ret;
+ struct inode *inode = d_real_inode(tu->path.dentry);
+ struct uprobe *uprobe;
tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
-
- if (tu->ref_ctr_offset)
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- else
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
- if (ret)
- tu->inode = NULL;
-
- return ret;
+ tu->uprobe = uprobe;
+ return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
@@ -1109,11 +1104,11 @@ static void __probe_event_disable(struct trace_probe *tp)
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- if (!tu->inode)
+ if (!tu->uprobe)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ uprobe_unregister(tu->uprobe, &tu->consumer);
+ tu->uprobe = NULL;
}
}
@@ -1310,7 +1305,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1334,7 +1329,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index fd28c1157bd3..3c0515a27842 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -432,7 +432,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
struct testmod_uprobe {
struct path path;
- loff_t offset;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
@@ -446,25 +446,25 @@ static int testmod_register_uprobe(loff_t offset)
{
int err = -EBUSY;
- if (uprobe.offset)
+ if (uprobe.uprobe)
return -EBUSY;
mutex_lock(&testmod_uprobe_mutex);
- if (uprobe.offset)
+ if (uprobe.uprobe)
goto out;
err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path);
if (err)
goto out;
- err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
- offset, 0, &uprobe.consumer);
- if (err)
+ uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
+ offset, 0, &uprobe.consumer);
+ if (IS_ERR(uprobe.uprobe)) {
+ err = PTR_ERR(uprobe.uprobe);
path_put(&uprobe.path);
- else
- uprobe.offset = offset;
-
+ uprobe.uprobe = NULL;
+ }
out:
mutex_unlock(&testmod_uprobe_mutex);
return err;
@@ -474,10 +474,10 @@ static void testmod_unregister_uprobe(void)
{
mutex_lock(&testmod_uprobe_mutex);
- if (uprobe.offset) {
- uprobe_unregister(d_real_inode(uprobe.path.dentry),
- uprobe.offset, &uprobe.consumer);
- uprobe.offset = 0;
+ if (uprobe.uprobe) {
+ uprobe_unregister(uprobe.uprobe, &uprobe.consumer);
+ path_put(&uprobe.path);
+ uprobe.uprobe = NULL;
}
mutex_unlock(&testmod_uprobe_mutex);