diff options
-rw-r--r-- | arch/x86/events/intel/pt.c | 203 | ||||
-rw-r--r-- | arch/x86/events/intel/pt.h | 12 | ||||
-rw-r--r-- | include/linux/perf_event.h | 19 | ||||
-rw-r--r-- | include/uapi/linux/perf_event.h | 10 | ||||
-rw-r--r-- | kernel/events/core.c | 198 | ||||
-rw-r--r-- | kernel/events/internal.h | 1 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 36 |
7 files changed, 431 insertions, 48 deletions
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 05e43d0f430b..1db7a51d9792 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -397,6 +397,20 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ +static void pt_config_start(struct perf_event *event) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + u64 ctl = event->hw.config; + + ctl |= RTIT_CTL_TRACEEN; + if (READ_ONCE(pt->vmx_on)) + perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); + else + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + WRITE_ONCE(event->hw.config, ctl); +} + /* Address ranges and their corresponding msr configuration registers */ static const struct pt_address_range { unsigned long msr_a; @@ -469,6 +483,7 @@ static u64 pt_config_filters(struct perf_event *event) static void pt_config(struct perf_event *event) { struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); u64 reg; /* First round: clear STATUS, in particular the PSB byte counter. */ @@ -478,7 +493,9 @@ static void pt_config(struct perf_event *event) } reg = pt_config_filters(event); - reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN; + reg |= RTIT_CTL_TRACEEN; + if (!buf->single) + reg |= RTIT_CTL_TOPA; /* * Previously, we had BRANCH_EN on by default, but now that PT has @@ -501,10 +518,7 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); event->hw.config = reg; - if (READ_ONCE(pt->vmx_on)) - perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); - else - wrmsrl(MSR_IA32_RTIT_CTL, reg); + pt_config_start(event); } static void pt_config_stop(struct perf_event *event) @@ -533,18 +547,6 @@ static void pt_config_stop(struct perf_event *event) wmb(); } -static void pt_config_buffer(void *buf, unsigned int topa_idx, - unsigned int output_off) -{ - u64 reg; - - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); - - reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); - - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); -} - /** * struct topa - ToPA metadata * @list: linkage to struct pt_buffer's list of tables @@ -602,6 +604,33 @@ static inline phys_addr_t topa_pfn(struct topa *topa) #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size)) #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size) +static void pt_config_buffer(struct pt_buffer *buf) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + u64 reg, mask; + void *base; + + if (buf->single) { + base = buf->data_pages[0]; + mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7; + } else { + base = topa_to_page(buf->cur)->table; + mask = (u64)buf->cur_idx; + } + + reg = virt_to_phys(base); + if (pt->output_base != reg) { + pt->output_base = reg; + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg); + } + + reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32); + if (pt->output_mask != reg) { + pt->output_mask = reg; + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); + } +} + /** * topa_alloc() - allocate page-sized ToPA table * @cpu: CPU on which to allocate. @@ -802,6 +831,11 @@ static void pt_update_head(struct pt *pt) struct pt_buffer *buf = perf_get_aux(&pt->handle); u64 topa_idx, base, old; + if (buf->single) { + local_set(&buf->data_size, buf->output_off); + return; + } + /* offset of the first region in this table from the beginning of buf */ base = buf->cur->offset + buf->output_off; @@ -903,18 +937,21 @@ static void pt_handle_status(struct pt *pt) */ static void pt_read_offset(struct pt_buffer *buf) { - u64 offset, base_topa; + struct pt *pt = this_cpu_ptr(&pt_ctx); struct topa_page *tp; - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - tp = phys_to_virt(base_topa); - buf->cur = &tp->topa; + if (!buf->single) { + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base); + tp = phys_to_virt(pt->output_base); + buf->cur = &tp->topa; + } - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask); /* offset within current output region */ - buf->output_off = offset >> 32; + buf->output_off = pt->output_mask >> 32; /* index of current output region within this table */ - buf->cur_idx = (offset & 0xffffff80) >> 7; + if (!buf->single) + buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7; } static struct topa_entry * @@ -1030,6 +1067,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long head = local64_read(&buf->head); unsigned long idx, npages, wakeup; + if (buf->single) + return 0; + /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) { perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); @@ -1111,13 +1151,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) if (buf->snapshot) head &= (buf->nr_pages << PAGE_SHIFT) - 1; - pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - te = pt_topa_entry_for_page(buf, pg); + if (!buf->single) { + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + te = pt_topa_entry_for_page(buf, pg); - cur_tp = topa_entry_to_page(te); - buf->cur = &cur_tp->topa; - buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); - buf->output_off = head & (pt_buffer_region_size(buf) - 1); + cur_tp = topa_entry_to_page(te); + buf->cur = &cur_tp->topa; + buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); + buf->output_off = head & (pt_buffer_region_size(buf) - 1); + } else { + buf->output_off = head; + } local64_set(&buf->head, head); local_set(&buf->data_size, 0); @@ -1131,6 +1175,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf) { struct topa *topa, *iter; + if (buf->single) + return; + list_for_each_entry_safe(topa, iter, &buf->tables, list) { /* * right now, this is in free_aux() path only, so @@ -1176,6 +1223,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, return 0; } +static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages) +{ + struct page *p = virt_to_page(buf->data_pages[0]); + int ret = -ENOTSUPP, order = 0; + + /* + * We can use single range output mode + * + in snapshot mode, where we don't need interrupts; + * + if the hardware supports it; + * + if the entire buffer is one contiguous allocation. + */ + if (!buf->snapshot) + goto out; + + if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output)) + goto out; + + if (PagePrivate(p)) + order = page_private(p); + + if (1 << order != nr_pages) + goto out; + + buf->single = true; + buf->nr_pages = nr_pages; + ret = 0; +out: + return ret; +} + /** * pt_buffer_setup_aux() - set up topa tables for a PT buffer * @cpu: Cpu on which to allocate, -1 means current. @@ -1198,6 +1275,13 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, if (!nr_pages) return NULL; + /* + * Only support AUX sampling in snapshot mode, where we don't + * generate NMIs. + */ + if (event->attr.aux_sample_size && !snapshot) + return NULL; + if (cpu == -1) cpu = raw_smp_processor_id(); node = cpu_to_node(cpu); @@ -1213,6 +1297,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, INIT_LIST_HEAD(&buf->tables); + ret = pt_buffer_try_single(buf, nr_pages); + if (!ret) + return buf; + ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL); if (ret) { kfree(buf); @@ -1379,9 +1467,8 @@ void intel_pt_interrupt(void) return; } - pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, - buf->output_off); - pt_config(event); + pt_config_buffer(buf); + pt_config_start(event); } } @@ -1444,8 +1531,7 @@ static void pt_event_start(struct perf_event *event, int mode) WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0; - pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, - buf->output_off); + pt_config_buffer(buf); pt_config(event); return; @@ -1496,6 +1582,52 @@ static void pt_event_stop(struct perf_event *event, int mode) } } +static long pt_event_snapshot_aux(struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + unsigned long from = 0, to; + long ret; + + if (WARN_ON_ONCE(!buf)) + return 0; + + /* + * Sampling is only allowed on snapshot events; + * see pt_buffer_setup_aux(). + */ + if (WARN_ON_ONCE(!buf->snapshot)) + return 0; + + /* + * Here, handle_nmi tells us if the tracing is on + */ + if (READ_ONCE(pt->handle_nmi)) + pt_config_stop(event); + + pt_read_offset(buf); + pt_update_head(pt); + + to = local_read(&buf->data_size); + if (to < size) + from = buf->nr_pages << PAGE_SHIFT; + from += to - size; + + ret = perf_output_copy_aux(&pt->handle, handle, from, to); + + /* + * If the tracing was on when we turned up, restart it. + * Compiler barrier not needed as we couldn't have been + * preempted by anything that touches pt->handle_nmi. + */ + if (pt->handle_nmi) + pt_config_start(event); + + return ret; +} + static void pt_event_del(struct perf_event *event, int mode) { pt_event_stop(event, PERF_EF_UPDATE); @@ -1615,6 +1747,7 @@ static __init int pt_init(void) pt_pmu.pmu.del = pt_event_del; pt_pmu.pmu.start = pt_event_start; pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.snapshot_aux = pt_event_snapshot_aux; pt_pmu.pmu.read = pt_event_read; pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; pt_pmu.pmu.free_aux = pt_buffer_free_aux; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 1d2bb7572374..96906a62aacd 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -64,6 +64,7 @@ struct pt_pmu { * @lost: if data was lost/truncated * @head: logical write offset inside the buffer * @snapshot: if this is for a snapshot/overwrite counter + * @single: use Single Range Output instead of ToPA * @stop_pos: STOP topa entry index * @intr_pos: INT topa entry index * @stop_te: STOP topa entry pointer @@ -80,6 +81,7 @@ struct pt_buffer { local_t data_size; local64_t head; bool snapshot; + bool single; long stop_pos, intr_pos; struct topa_entry *stop_te, *intr_te; void **data_pages; @@ -111,16 +113,20 @@ struct pt_filters { /** * struct pt - per-cpu pt context - * @handle: perf output handle + * @handle: perf output handle * @filters: last configured filters - * @handle_nmi: do handle PT PMI on this cpu, there's an active event - * @vmx_on: 1 if VMX is ON on this cpu + * @handle_nmi: do handle PT PMI on this cpu, there's an active event + * @vmx_on: 1 if VMX is ON on this cpu + * @output_base: cached RTIT_OUTPUT_BASE MSR value + * @output_mask: cached RTIT_OUTPUT_MASK MSR value */ struct pt { struct perf_output_handle handle; struct pt_filters filters; int handle_nmi; int vmx_on; + u64 output_base; + u64 output_mask; }; #endif /* __INTEL_PT_H__ */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 011dcbdbccc2..34c7c6910026 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -249,6 +249,8 @@ struct perf_event; #define PERF_PMU_CAP_NO_EXCLUDE 0x80 #define PERF_PMU_CAP_AUX_OUTPUT 0x100 +struct perf_output_handle; + /** * struct pmu - generic performance monitoring unit */ @@ -433,6 +435,19 @@ struct pmu { void (*free_aux) (void *aux); /* optional */ /* + * Take a snapshot of the AUX buffer without touching the event + * state, so that preempting ->start()/->stop() callbacks does + * not interfere with their logic. Called in PMI context. + * + * Returns the size of AUX data copied to the output handle. + * + * Optional. + */ + long (*snapshot_aux) (struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size); + + /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. @@ -973,6 +988,7 @@ struct perf_sample_data { u32 reserved; } cpu_entry; struct perf_callchain_entry *callchain; + u64 aux_size; /* * regs_user may point to task_pt_regs or to regs_user_copy, depending @@ -1362,6 +1378,9 @@ extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); +extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index bb7b271397a6..377d794d3105 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -141,8 +141,9 @@ enum perf_event_sample_format { PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -300,6 +301,7 @@ enum perf_event_read_format { /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -424,7 +426,9 @@ struct perf_event_attr { */ __u32 aux_watermark; __u16 sample_max_stack; - __u16 __reserved_2; /* align to __u64 */ + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; }; /* @@ -864,6 +868,8 @@ enum perf_event_type { * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index cfd89b4a02d8..16d80ad8d6d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1031,7 +1031,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } @@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event) } } +static bool perf_need_aux_event(struct perf_event *event) +{ + return !!event->attr.aux_output || !!event->attr.aux_sample_size; +} + static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { @@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event, if (!group_leader) return 0; - if (!perf_aux_output_match(event, group_leader)) + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) @@ -6222,6 +6237,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) +{ + struct perf_event *sampler = event->aux_event; + struct ring_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). + */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +long perf_pmu_snapshot_aux(struct ring_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + unsigned long pad; + struct ring_buffer *rb; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. + */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; + + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -6541,6 +6672,13 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6729,6 +6867,35 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); + + if (sample_type & PERF_SAMPLE_AUX) { + u64 size; + + header->size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). + */ + size = min_t(size_t, U16_MAX - header->size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header->size > U16_MAX); + header->size += size; + } + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. + */ + WARN_ON_ONCE(header->size & 7); } static __always_inline int @@ -10307,7 +10474,6 @@ static struct pmu *perf_init_event(struct perf_event *event) goto unlock; } - rcu_read_lock(); /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. @@ -10317,6 +10483,7 @@ static struct pmu *perf_init_event(struct perf_event *event) type = PERF_TYPE_RAW; again: + rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { @@ -10609,6 +10776,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. + */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; @@ -10718,7 +10894,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -11268,7 +11444,7 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) goto err_locked; /* @@ -11416,8 +11592,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, int err; /* - * Get the target context (task or percpu): + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. */ + if (attr->aux_output) + return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); @@ -11429,6 +11608,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -11880,7 +12062,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } @@ -11983,7 +12165,7 @@ static int inherit_group(struct perf_event *parent_event, if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); - if (sub->aux_event == parent_event && + if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 3aef4191798c..747d67f130cb 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -50,6 +50,7 @@ struct ring_buffer { unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; + int aux_in_sampling; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 246c83ac5643..7ffd5c763f93 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle) } EXPORT_SYMBOL_GPL(perf_get_aux); +/* + * Copy out AUX data from an AUX handle. + */ +long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to) +{ + unsigned long tocopy, remainder, len = 0; + struct ring_buffer *rb = aux_handle->rb; + void *addr; + + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + + do { + tocopy = PAGE_SIZE - offset_in_page(from); + if (to > from) + tocopy = min(tocopy, to - from); + if (!tocopy) + break; + + addr = rb->aux_pages[from >> PAGE_SHIFT]; + addr += offset_in_page(from); + + remainder = perf_output_copy(handle, addr, tocopy); + if (remainder) + return -EFAULT; + + len += tocopy; + from += tocopy; + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + } while (to != from); + + return len; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) |