From 256d92bc93fd40411a02be5cdba74a7bf91e6e09 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 Dec 2018 14:06:19 +0200 Subject: perf thread-stack: Fix thread stack processing for the idle task perf creates a single 'struct thread' to represent the idle task. That is because threads are identified by PID and TID, and the idle task always has PID == TID == 0. However, there are actually separate idle tasks for each CPU. That creates a problem for thread stack processing which assumes that each thread has a single stack, not one stack per CPU. Fix that by passing through the CPU number, and in the case of the idle "thread", pick the thread stack from an array based on the CPU number. Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20181221120620.9659-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread-stack.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/perf/util/thread-stack.h') diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index f97c00a8c251..1f626f4a1c40 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -80,14 +80,14 @@ struct call_return_processor { void *data; }; -int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, +int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip, u64 to_ip, u16 insn_len, u64 trace_nr); -void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr); -void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, +void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr); +void thread_stack__sample(struct thread *thread, int cpu, struct ip_callchain *chain, size_t sz, u64 ip, u64 kernel_start); int thread_stack__flush(struct thread *thread); void thread_stack__free(struct thread *thread); -size_t thread_stack__depth(struct thread *thread); +size_t thread_stack__depth(struct 
thread *thread, int cpu); struct call_return_processor * call_return_processor__new(int (*process)(struct call_return *cr, void *data), -- cgit From f08046cb3082b313e7b08dc35838cf8bd902c36b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 9 Jan 2019 11:18:33 +0200 Subject: perf thread-stack: Represent jmps to the start of a different symbol The compiler might optimize a call/ret combination by making it a jmp. However the thread-stack does not presently cater for that, so that such control flow is not visible in the call graph. Make it visible by recording on the stack a branch to the start of a different symbol. Note, that means when a ret pops the stack, all jmps must be popped off first. Example: $ cat jmp-to-fn.c __attribute__((noinline)) int bar(void) { return -1; } __attribute__((noinline)) int foo(void) { return bar() + 1; } int main() { return foo(); } $ gcc -ggdb3 -Wall -Wextra -O2 -o jmp-to-fn jmp-to-fn.c $ objdump -d jmp-to-fn 0000000000001040
<main>: 1040: 31 c0 xor %eax,%eax 1042: e9 09 01 00 00 jmpq 1150 <foo> 0000000000001140 <bar>: 1140: b8 ff ff ff ff mov $0xffffffff,%eax 1145: c3 retq 0000000000001150 <foo>: 1150: 31 c0 xor %eax,%eax 1152: e8 e9 ff ff ff callq 1140 <bar> 1157: 83 c0 01 add $0x1,%eax 115a: c3 retq $ perf record -o jmp-to-fn.perf.data -e intel_pt/cyc/u ./jmp-to-fn [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0,017 MB jmp-to-fn.perf.data ] $ perf script -i jmp-to-fn.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py jmp-to-fn.db branches calls 2019-01-08 13:24:58.783069 Creating database... 2019-01-08 13:24:58.794650 Writing records... 2019-01-08 13:24:59.008050 Adding indexes 2019-01-08 13:24:59.015802 Done $ ~/libexec/perf-core/scripts/python/exported-sql-viewer.py jmp-to-fn.db Before: main -> bar After: main -> foo -> bar Committer testing: Install the python2-pyside package, then select these menu options on the GUI: "Reports" "Context sensitive callgraphs" Then go on expanding the symbols to get the full picture. Doing this on fedora:29 with gcc version 8.2.1 20181215 (Red Hat 8.2.1-6) (GCC) gives: jmp-to-fn PID:TID _start (ld-2.28.so) __libc_start_main main foo bar This verifies that the patch indeed fixes the problem. 
Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190109091835.5570-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 2 +- tools/perf/scripts/python/export-to-sqlite.py | 2 +- tools/perf/util/thread-stack.c | 30 +++++++++++++++++++++-- tools/perf/util/thread-stack.h | 3 +++ 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'tools/perf/util/thread-stack.h') diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 0564dd7377f2..30130213da7e 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -478,7 +478,7 @@ if perf_db_export_calls: 'branch_count,' 'call_id,' 'return_id,' - 'CASE WHEN flags=1 THEN \'no call\' WHEN flags=2 THEN \'no return\' WHEN flags=3 THEN \'no call/return\' ELSE \'\' END AS flags,' + 'CASE WHEN flags=0 THEN \'\' WHEN flags=1 THEN \'no call\' WHEN flags=2 THEN \'no return\' WHEN flags=3 THEN \'no call/return\' WHEN flags=6 THEN \'jump\' ELSE flags END AS flags,' 'parent_call_path_id' ' FROM calls INNER JOIN call_paths ON call_paths.id = call_path_id') diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 245caf2643ed..ed237f2ed03f 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -320,7 +320,7 @@ if perf_db_export_calls: 'branch_count,' 'call_id,' 'return_id,' - 'CASE WHEN flags=1 THEN \'no call\' WHEN flags=2 THEN \'no return\' WHEN flags=3 THEN \'no call/return\' ELSE \'\' END AS flags,' + 'CASE WHEN flags=0 THEN \'\' WHEN flags=1 THEN \'no call\' WHEN flags=2 THEN \'no return\' WHEN flags=3 THEN \'no call/return\' WHEN flags=6 THEN \'jump\' ELSE flags END AS flags,' 'parent_call_path_id' ' FROM calls INNER JOIN call_paths ON call_paths.id = 
call_path_id') diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 7f8eff018c16..f52c0f90915d 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -38,6 +38,7 @@ * @cp: call path * @no_call: a 'call' was not seen * @trace_end: a 'call' but trace ended + * @non_call: a branch but not a 'call' to the start of a different symbol */ struct thread_stack_entry { u64 ret_addr; @@ -47,6 +48,7 @@ struct thread_stack_entry { struct call_path *cp; bool no_call; bool trace_end; + bool non_call; }; /** @@ -268,6 +270,8 @@ static int thread_stack__call_return(struct thread *thread, cr.flags |= CALL_RETURN_NO_CALL; if (no_return) cr.flags |= CALL_RETURN_NO_RETURN; + if (tse->non_call) + cr.flags |= CALL_RETURN_NON_CALL; return crp->process(&cr, crp->data); } @@ -510,6 +514,7 @@ static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr, tse->cp = cp; tse->no_call = no_call; tse->trace_end = trace_end; + tse->non_call = false; return 0; } @@ -531,14 +536,16 @@ static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts, timestamp, ref, false); } - if (ts->stack[ts->cnt - 1].ret_addr == ret_addr) { + if (ts->stack[ts->cnt - 1].ret_addr == ret_addr && + !ts->stack[ts->cnt - 1].non_call) { return thread_stack__call_return(thread, ts, --ts->cnt, timestamp, ref, false); } else { size_t i = ts->cnt - 1; while (i--) { - if (ts->stack[i].ret_addr != ret_addr) + if (ts->stack[i].ret_addr != ret_addr || + ts->stack[i].non_call) continue; i += 1; while (ts->cnt > i) { @@ -757,6 +764,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm, err = thread_stack__trace_begin(thread, ts, sample->time, ref); } else if (sample->flags & PERF_IP_FLAG_TRACE_END) { err = thread_stack__trace_end(ts, sample, ref); + } else if (sample->flags & PERF_IP_FLAG_BRANCH && + from_al->sym != to_al->sym && to_al->sym && + to_al->addr == to_al->sym->start) { + struct call_path_root *cpr = ts->crp->cpr; + 
struct call_path *cp; + + /* + * The compiler might optimize a call/ret combination by making + * it a jmp. Make that visible by recording on the stack a + * branch to the start of a different symbol. Note, that means + * when a ret pops the stack, all jmps must be popped off first. + */ + cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, + to_al->sym, sample->addr, + ts->kernel_start); + err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false, + false); + if (!err) + ts->stack[ts->cnt - 1].non_call = true; } return err; diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index 1f626f4a1c40..b7c04e19ad41 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -35,10 +35,13 @@ struct call_path; * * CALL_RETURN_NO_CALL: 'return' but no matching 'call' * CALL_RETURN_NO_RETURN: 'call' but no matching 'return' + * CALL_RETURN_NON_CALL: a branch but not a 'call' to the start of a different + * symbol */ enum { CALL_RETURN_NO_CALL = 1 << 0, CALL_RETURN_NO_RETURN = 1 << 1, + CALL_RETURN_NON_CALL = 1 << 2, }; /** -- cgit From f435887ec0c941b97301bd6ed1f3e4b5200df409 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 28 Feb 2019 15:00:24 +0200 Subject: perf db-export: Add calls parent_id to enable creation of call trees The call_path can be used to find the parent symbol for a call but not the exact parent call. To do that add parent_id to the call_return export. This enables the creation of a call tree from the exported data. 
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: https://lkml.kernel.org/n/tip-6j7tzdxo67cox6kan7k22oo6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 15 ++++++++++----- tools/perf/util/db-export.h | 3 ++- tools/perf/util/scripting-engines/trace-event-python.c | 8 +++++--- tools/perf/util/thread-stack.c | 16 ++++++++++++++-- tools/perf/util/thread-stack.h | 6 ++++-- 5 files changed, 35 insertions(+), 13 deletions(-) (limited to 'tools/perf/util/thread-stack.h') diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index de9b4769d06c..d7315a00c731 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -510,18 +510,23 @@ int db_export__call_path(struct db_export *dbe, struct call_path *cp) return 0; } -int db_export__call_return(struct db_export *dbe, struct call_return *cr) +int db_export__call_return(struct db_export *dbe, struct call_return *cr, + u64 *parent_db_id) { int err; - if (cr->db_id) - return 0; - err = db_export__call_path(dbe, cr->cp); if (err) return err; - cr->db_id = ++dbe->call_return_last_db_id; + if (!cr->db_id) + cr->db_id = ++dbe->call_return_last_db_id; + + if (parent_db_id) { + if (!*parent_db_id) + *parent_db_id = ++dbe->call_return_last_db_id; + cr->parent_db_id = *parent_db_id; + } if (dbe->export_call_return) return dbe->export_call_return(dbe, cr); diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 67bc6b8ad2d6..4e2424c89df9 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -104,6 +104,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, int db_export__branch_types(struct db_export *dbe); int db_export__call_path(struct db_export *dbe, struct call_path *cp); -int db_export__call_return(struct db_export *dbe, struct call_return *cr); +int db_export__call_return(struct db_export *dbe, struct call_return *cr, + u64 *parent_db_id); #endif diff --git 
a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 0e17db41b49b..09604c6508f0 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1173,7 +1173,7 @@ static int python_export_call_return(struct db_export *dbe, u64 comm_db_id = cr->comm ? cr->comm->db_id : 0; PyObject *t; - t = tuple_new(11); + t = tuple_new(12); tuple_set_u64(t, 0, cr->db_id); tuple_set_u64(t, 1, cr->thread->db_id); @@ -1186,6 +1186,7 @@ static int python_export_call_return(struct db_export *dbe, tuple_set_u64(t, 8, cr->return_ref); tuple_set_u64(t, 9, cr->cp->parent->db_id); tuple_set_s32(t, 10, cr->flags); + tuple_set_u64(t, 11, cr->parent_db_id); call_object(tables->call_return_handler, t, "call_return_table"); @@ -1194,11 +1195,12 @@ static int python_export_call_return(struct db_export *dbe, return 0; } -static int python_process_call_return(struct call_return *cr, void *data) +static int python_process_call_return(struct call_return *cr, u64 *parent_db_id, + void *data) { struct db_export *dbe = data; - return db_export__call_return(dbe, cr); + return db_export__call_return(dbe, cr, parent_db_id); } static void python_process_general_event(struct perf_sample *sample, diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index a8b45168513c..41942c2aaa18 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -49,6 +49,7 @@ enum retpoline_state_t { * @timestamp: timestamp (if known) * @ref: external reference (e.g. 
db_id of sample) * @branch_count: the branch count when the entry was created + * @db_id: id used for db-export * @cp: call path * @no_call: a 'call' was not seen * @trace_end: a 'call' but trace ended @@ -59,6 +60,7 @@ struct thread_stack_entry { u64 timestamp; u64 ref; u64 branch_count; + u64 db_id; struct call_path *cp; bool no_call; bool trace_end; @@ -280,12 +282,14 @@ static int thread_stack__call_return(struct thread *thread, .comm = ts->comm, .db_id = 0, }; + u64 *parent_db_id; tse = &ts->stack[idx]; cr.cp = tse->cp; cr.call_time = tse->timestamp; cr.return_time = timestamp; cr.branch_count = ts->branch_count - tse->branch_count; + cr.db_id = tse->db_id; cr.call_ref = tse->ref; cr.return_ref = ref; if (tse->no_call) @@ -295,7 +299,14 @@ static int thread_stack__call_return(struct thread *thread, if (tse->non_call) cr.flags |= CALL_RETURN_NON_CALL; - return crp->process(&cr, crp->data); + /* + * The parent db_id must be assigned before exporting the child. Note + * it is not possible to export the parent first because its information + * is not yet complete because its 'return' has not yet been processed. + */ + parent_db_id = idx ? 
&(tse - 1)->db_id : NULL; + + return crp->process(&cr, parent_db_id, crp->data); } static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts) @@ -484,7 +495,7 @@ void thread_stack__sample(struct thread *thread, int cpu, } struct call_return_processor * -call_return_processor__new(int (*process)(struct call_return *cr, void *data), +call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data), void *data) { struct call_return_processor *crp; @@ -537,6 +548,7 @@ static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr, tse->no_call = no_call; tse->trace_end = trace_end; tse->non_call = false; + tse->db_id = 0; return 0; } diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index b7c04e19ad41..9c45f947f5a9 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -55,6 +55,7 @@ enum { * @call_ref: external reference to 'call' sample (e.g. db_id) * @return_ref: external reference to 'return' sample (e.g. 
db_id) * @db_id: id used for db-export + * @parent_db_id: id of parent call used for db-export * @flags: Call/Return flags */ struct call_return { @@ -67,6 +68,7 @@ struct call_return { u64 call_ref; u64 return_ref; u64 db_id; + u64 parent_db_id; u32 flags; }; @@ -79,7 +81,7 @@ struct call_return { */ struct call_return_processor { struct call_path_root *cpr; - int (*process)(struct call_return *cr, void *data); + int (*process)(struct call_return *cr, u64 *parent_db_id, void *data); void *data; }; @@ -93,7 +95,7 @@ void thread_stack__free(struct thread *thread); size_t thread_stack__depth(struct thread *thread, int cpu); struct call_return_processor * -call_return_processor__new(int (*process)(struct call_return *cr, void *data), +call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data), void *data); void call_return_processor__free(struct call_return_processor *crp); int thread_stack__process(struct thread *thread, struct comm *comm, -- cgit