diff options
Diffstat (limited to 'tools/perf/bench')
-rw-r--r-- | tools/perf/bench/Build | 2 | ||||
-rw-r--r-- | tools/perf/bench/bench.h | 2 | ||||
-rw-r--r-- | tools/perf/bench/find-bit-bench.c | 135 | ||||
-rw-r--r-- | tools/perf/bench/mem-functions.c | 21 | ||||
-rw-r--r-- | tools/perf/bench/numa.c | 77 | ||||
-rw-r--r-- | tools/perf/bench/syscall.c | 81 |
6 files changed, 270 insertions, 48 deletions
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 768e408757a0..dd68a40a790c 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -1,5 +1,6 @@ perf-y += sched-messaging.o perf-y += sched-pipe.o +perf-y += syscall.o perf-y += mem-functions.o perf-y += futex-hash.o perf-y += futex-wake.o @@ -10,6 +11,7 @@ perf-y += epoll-wait.o perf-y += epoll-ctl.o perf-y += synthesize.o perf-y += kallsyms-parse.o +perf-y += find-bit-bench.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 61cae4966cae..2804812d4154 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -33,8 +33,10 @@ extern struct timeval bench__start, bench__end, bench__runtime; int bench_numa(int argc, const char **argv); int bench_sched_messaging(int argc, const char **argv); int bench_sched_pipe(int argc, const char **argv); +int bench_syscall_basic(int argc, const char **argv); int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); +int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); int bench_futex_wake(int argc, const char **argv); int bench_futex_wake_parallel(int argc, const char **argv); diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c new file mode 100644 index 000000000000..73b5bcc5946a --- /dev/null +++ b/tools/perf/bench/find-bit-bench.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark find_next_bit and related bit operations. + * + * Copyright 2020 Google LLC. 
+ */ +#include <stdlib.h> +#include "bench.h" +#include "../util/stat.h" +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int outer_iterations = 5; +static unsigned int inner_iterations = 100000; + +static const struct option options[] = { + OPT_UINTEGER('i', "outer-iterations", &outer_iterations, + "Number of outer iterations used"), + OPT_UINTEGER('j', "inner-iterations", &inner_iterations, + "Number of inner iterations used"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench mem find_bit <options>", + NULL +}; +/* Side-effect counters written by workload(); the timed loops are checked against accumulator. */ +static unsigned int accumulator; +static unsigned int use_of_val; +/* Out-of-line body run once per visited set bit: sums the bit index and counts the call. */ +static noinline void workload(int val) +{ + use_of_val += val; + accumulator++; +} +/* x86 with flag-output asm: read the bit via BT and the carry flag; elsewhere fall back to test_bit(). */ +#if (defined(__i386__) || defined(__x86_64__)) && defined(__GCC_ASM_FLAG_OUTPUTS__) +static bool asm_test_bit(long nr, const unsigned long *addr) +{ + bool oldbit; + + asm volatile("bt %2,%1" + : "=@ccc" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); + + return oldbit; +} +#else +#define asm_test_bit test_bit +#endif +/* Time for_each_set_bit() against a bit-by-bit test loop on a num_bits bitmap, for each power-of-two count of set bits; returns 0. */ +static int do_for_each_set_bit(unsigned int num_bits) +{ + unsigned long *to_test = bitmap_alloc(num_bits); + struct timeval start, end, diff; + u64 runtime_us; + struct stats fb_time_stats, tb_time_stats; + double time_average, time_stddev; + unsigned int bit, i, j; + unsigned int set_bits, skip; + unsigned int old; + /* NOTE(review): stats are initialised once per call, so later set_bits reports also include earlier samples - confirm intended. */ + init_stats(&fb_time_stats); + init_stats(&tb_time_stats); + + for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) { + bitmap_zero(to_test, num_bits); + skip = num_bits / set_bits; + for (i = 0; i < num_bits; i += skip) + set_bit(i, to_test); + /* One timing sample per outer iteration for each of the two loops. */ + for (i = 0; i < outer_iterations; i++) { + old = accumulator; + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for_each_set_bit(bit, to_test, num_bits) + workload(bit); + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); 
+ timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&fb_time_stats, runtime_us); + /* Same bitmap again, this time probing every bit position. */ + old = accumulator; + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for (bit = 0; bit < num_bits; bit++) { + if (asm_test_bit(bit, to_test)) + workload(bit); + } + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&tb_time_stats, runtime_us); + } + + printf("%u operations %u bits set of %u bits\n", + inner_iterations, set_bits, num_bits); + time_average = avg_stats(&fb_time_stats); + time_stddev = stddev_stats(&fb_time_stats); + printf(" Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + time_average = avg_stats(&tb_time_stats); + time_stddev = stddev_stats(&tb_time_stats); + printf(" Average test_bit loop took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + if (use_of_val == accumulator) /* Try to avoid compiler tricks. 
*/ + printf("\n"); + } + bitmap_free(to_test); + return 0; +} +/* Entry point for 'perf bench mem find_bit': rejects leftover arguments, then benchmarks bitmap sizes 1, 2, 4, ... 2048. */ +int bench_mem_find_bit(int argc, const char **argv) +{ + int err = 0, i; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + for (i = 1; i <= 2048; i <<= 1) + do_for_each_set_bit(i); + + return err; +} diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 9235b76501be..19d45c377ac1 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -223,12 +223,8 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * return 0; } -static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) { - u64 cycle_start = 0ULL, cycle_end = 0ULL; - memcpy_t fn = r->fn.memcpy; - int i; - /* Make sure to always prefault zero pages even if MMAP_THRESH is crossed: */ memset(src, 0, size); @@ -237,6 +233,15 @@ static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, vo * to not measure page fault overhead: */ fn(dst, src, size); +} + +static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +{ + u64 cycle_start = 0ULL, cycle_end = 0ULL; + memcpy_t fn = r->fn.memcpy; + int i; + + memcpy_prefault(fn, size, src, dst); cycle_start = get_cycles(); for (i = 0; i < nr_loops; ++i) @@ -252,11 +257,7 @@ static double do_memcpy_gettimeofday(const struct function *r, size_t size, void memcpy_t fn = r->fn.memcpy; int i; - /* - * We prefault the freshly allocated memory range here, - * to not measure page fault overhead: - */ - fn(dst, src, size); + memcpy_prefault(fn, size, src, dst); BUG_ON(gettimeofday(&tv_start, NULL)); for (i = 0; i < nr_loops; ++i) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 5797253b9700..f85bceccc459 100644 --- a/tools/perf/bench/numa.c +++ 
b/tools/perf/bench/numa.c @@ -247,17 +247,22 @@ static int is_node_present(int node) */ static bool node_has_cpus(int node) { - struct bitmask *cpu = numa_allocate_cpumask(); - unsigned int i; + struct bitmask *cpumask = numa_allocate_cpumask(); + bool ret = false; /* fall back to nocpus */ + int cpu; - if (cpu && !numa_node_to_cpus(node, cpu)) { - for (i = 0; i < cpu->size; i++) { - if (numa_bitmask_isbitset(cpu, i)) - return true; + BUG_ON(!cpumask); /* a NULL mask now trips BUG_ON instead of falling through to 'false' */ + if (!numa_node_to_cpus(node, cpumask)) { /* a zero return is treated as success */ + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) { + ret = true; /* one set bit is enough; stop scanning */ + break; + } } } + numa_free_cpumask(cpumask); /* reached on both the found and not-found paths; the removed code never freed the mask */ - return false; /* lets fall back to nocpus safely */ + return ret; } static cpu_set_t bind_to_cpu(int target_cpu) @@ -288,14 +293,10 @@ static cpu_set_t bind_to_cpu(int target_cpu) static cpu_set_t bind_to_node(int target_node) { - int cpus_per_node = g->p.nr_cpus / nr_numa_nodes(); cpu_set_t orig_mask, mask; int cpu; int ret; - BUG_ON(cpus_per_node * nr_numa_nodes() != g->p.nr_cpus); - BUG_ON(!cpus_per_node); - ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); BUG_ON(ret); @@ -305,13 +306,16 @@ static cpu_set_t bind_to_node(int target_node) for (cpu = 0; cpu < g->p.nr_cpus; cpu++) CPU_SET(cpu, &mask); } else { - int cpu_start = (target_node + 0) * cpus_per_node; - int cpu_stop = (target_node + 1) * cpus_per_node; + struct bitmask *cpumask = numa_allocate_cpumask(); - BUG_ON(cpu_stop > g->p.nr_cpus); - - for (cpu = cpu_start; cpu < cpu_stop; cpu++) - CPU_SET(cpu, &mask); + BUG_ON(!cpumask); + if (!numa_node_to_cpus(target_node, cpumask)) { + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) + CPU_SET(cpu, &mask); + } + } + numa_free_cpumask(cpumask); } ret = sched_setaffinity(0, sizeof(mask), &mask); @@ -729,8 +733,6 @@ static int parse_nodes_opt(const struct option *opt __maybe_unused, return -1; return parse_node_list(arg); - - return 0; } #define BIT(x) (1ul << x) @@ 
-813,12 +815,12 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val } } } else if (!g->p.data_backwards || (nr + loop) & 1) { + /* Process data forwards: */ d0 = data + off; d = data + off + 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d >= d1)) d = data; @@ -836,7 +838,6 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val d = data + off - 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d < data)) d = data + words-1; @@ -1733,12 +1734,12 @@ err: */ static const char *tests[][MAX_ARGS] = { /* Basic single-stream NUMA bandwidth measurements: */ - { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM }, { "RAM-bw-local-NOTHP,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, - { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "1", OPT_BW_RAM }, /* 2-stream NUMA bandwidth measurements: */ @@ -1755,7 +1756,7 @@ static const char *tests[][MAX_ARGS] = { { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, - { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV }, { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV }, { " 4x4-convergence-NOTHP,", @@ -1780,24 +1781,24 @@ static const char *tests[][MAX_ARGS] = { "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, 
- { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, - { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, - { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, - { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + { " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, - { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, - { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, - { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, - { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, - { " 4x8-bw-thread-NOTHP,", + { " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, + { " 4x8-bw-process-NOTHP,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, - { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, - { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + { " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, - { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, - { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + { "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, - { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW 
}, + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, { "numa01-bw-thread-NOTHP,", diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c new file mode 100644 index 000000000000..5fe621cff8e9 --- /dev/null +++ b/tools/perf/bench/syscall.c @@ -0,0 +1,81 @@ +/* + * + * syscall.c + * + * syscall: Benchmark for system call performance + */ +#include "../perf.h" +#include "../util/util.h" +#include <subcmd/parse-options.h> +#include "../builtin.h" +#include "bench.h" + +#include <stdio.h> +#include <sys/time.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <unistd.h> +#include <stdlib.h> + +#define LOOPS_DEFAULT 10000000 +static int loops = LOOPS_DEFAULT; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_syscall_usage[] = { + "perf bench syscall <options>", + NULL +}; +/* Entry point for 'perf bench syscall': times `loops` consecutive getppid() calls with gettimeofday() and prints the result in the selected bench_format; returns 0. */ +int bench_syscall_basic(int argc, const char **argv) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + int i; + + argc = parse_options(argc, argv, options, bench_syscall_usage, 0); + /* NOTE(review): argc (leftover non-option arguments) is not checked after parsing. */ + gettimeofday(&start, NULL); + + for (i = 0; i < loops; i++) + getppid(); + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %'d getppid() calls\n", loops); + + result_usec = (unsigned long long)diff.tv_sec * 1000000; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %'14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)1000000))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) 
(diff.tv_usec / 1000)); + break; + + default: + /* Only reached if bench_format holds an unknown value */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} |